MLIR: lib/Dialect/Affine/Transforms/LoopFusion.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
14
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/DenseSet.h"
28 #include "llvm/ADT/STLExtras.h"
29 #include "llvm/Support/CommandLine.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/raw_ostream.h"
32 #include
33 #include
34 #include
35
36 namespace mlir {
37 namespace affine {
38 #define GEN_PASS_DEF_AFFINELOOPFUSION
39 #include "mlir/Dialect/Affine/Passes.h.inc"
40 }
41 }
42
43 #define DEBUG_TYPE "affine-fusion"
44
45 using namespace mlir;
47
48 namespace {
49
50
51
52
53
54 struct LoopFusion : public affine::impl::AffineLoopFusionBase {
55 LoopFusion() = default;
56 LoopFusion(unsigned fastMemorySpace, uint64_t localBufSizeThresholdBytes,
57 bool maximalFusion, enum FusionMode affineFusionMode) {
58 this->fastMemorySpace = fastMemorySpace;
59 this->localBufSizeThreshold = localBufSizeThresholdBytes / 1024;
60 this->maximalFusion = maximalFusion;
61 this->affineFusionMode = affineFusionMode;
62 }
63
64 void runOnBlock(Block *block);
65 void runOnOperation() override;
66 };
67
68 }
69
70
71
72
73
74
75
76
81
83 bool hasOutDepsAfterFusion = false;
84
85 for (auto &outEdge : mdg.outEdges.lookup(srcId)) {
87
88 if (depNodeOp == dstNodeOp)
89 continue;
90
91
92
94 return false;
95
96
97
98 if (fusedLoopInsPoint != depNodeOp &&
100 LLVM_DEBUG(llvm::dbgs() << "Src loop can't be removed: dst loop doesn't "
101 "dominate dependence\n");
102 return false;
103 }
104
105 hasOutDepsAfterFusion = true;
106 }
107
108
109
110
111 if (hasOutDepsAfterFusion || !escapingMemRefs.empty()) {
112 std::optional isMaximal = fusionSlice.isMaximal();
113 if (!isMaximal) {
114 LLVM_DEBUG(llvm::dbgs() << "Src loop can't be removed: can't determine "
115 "if fusion is maximal\n");
116 return false;
117 }
118
119 if (!*isMaximal) {
120 LLVM_DEBUG(llvm::dbgs()
121 << "Src loop can't be removed: fusion is not maximal\n");
122 return false;
123 }
124 }
125
126 return true;
127 }
128
129
130
131
132
133
134
135
139
140 if (mdg.inEdges.count(dstId) == 0)
141 return;
142
143
144 auto *dstNode = mdg.getNode(dstId);
146 for (Operation *load : dstNode->loads)
147 consumedMemrefs.insert(cast(load).getMemRef());
148
149
150
151 for (const auto &srcEdge : mdg.inEdges.lookup(dstId)) {
152 const auto *srcNode = mdg.getNode(srcEdge.id);
153
154 if (!isa(srcNode->op))
155 continue;
156
157 if (any_of(srcNode->stores, [&](Operation *op) {
158 auto storeOp = cast(op);
159 return consumedMemrefs.count(storeOp.getMemRef()) > 0;
160 }))
161 srcIdCandidates.push_back(srcNode->id);
162 }
163
164 llvm::sort(srcIdCandidates);
165 srcIdCandidates.erase(llvm::unique(srcIdCandidates), srcIdCandidates.end());
166 }
167
168
169
170 static void
174 auto *dstNode = mdg.getNode(dstId);
175 auto *srcNode = mdg.getNode(srcId);
177 producerConsumerMemrefs);
178 }
179
180
181
182
183
184
185
186
189
190 if (!defOp)
191 return true;
192
193
194 if (auto viewOp = dyn_castmlir::ViewLikeOpInterface(defOp))
196 return true;
197
198
199 if (!hasSingleEffectmlir::MemoryEffects::Allocate(defOp, memref))
200 return true;
201
202
203
205
206 Operation *ancestorOp = block->getParent()->findAncestorOpInRegion(*user);
207 if (!ancestorOp)
208 return true;
209 if (ancestorOp->getBlock() != block)
210 return false;
211 return !isa(*user);
212 });
213 }
214
215
216
219 auto *node = mdg.getNode(id);
220 for (Operation *storeOp : node->stores) {
221 auto memref = cast(storeOp).getMemRef();
222 if (escapingMemRefs.count(memref))
223 continue;
225 escapingMemRefs.insert(memref);
226 }
227 }
228
229
230
231
232
233
235 assert(isa(node->op));
237 node->op = newRootForOp;
238 }
239
240
241
242
243
247 assert(!producerStores.empty() && "expected producer store");
248
249
250
251
252
253 Block *commonBlock = nullptr;
254
255 for (Operation *store : producerStores) {
257 !commonBlock ? &*sliceInsertionBlock->begin() : &*commonBlock->begin();
259 }
260 assert(commonBlock &&
261 "common block of producer stores and slice should exist");
262
263
264
265 Operation *firstAncestor = nullptr;
266 for (Operation *store : producerStores) {
268 assert(ancestor && "producer store should be contained in common block");
269 firstAncestor = !firstAncestor || ancestor->isBeforeInBlock(firstAncestor)
270 ? ancestor
271 : firstAncestor;
272 }
273 return firstAncestor;
274 }
275
276
277
278
279
281 AffineForOp srcForOp, AffineForOp dstForOp, unsigned depth,
283 int64_t &fusedLoopNestComputeCost) {
284 LLVM_DEBUG(llvm::dbgs() << "Determining additional compute fraction...\n";);
285
286
289 LLVM_DEBUG(llvm::dbgs() << "Failed to get source loop nest stats.\n");
290 return std::nullopt;
291 }
292
293
296 LLVM_DEBUG(llvm::dbgs() << "Failed to get destination loop nest stats.\n");
297 return std::nullopt;
298 }
299
300
301 uint64_t srcLoopNestCost = getComputeCost(srcForOp, srcLoopNestStats);
302
303
304 uint64_t dstLoopNestCost = getComputeCost(dstForOp, dstLoopNestStats);
305
307
309 LLVM_DEBUG(llvm::dbgs() << "Slice wasn't computed.\n");
310 return std::nullopt;
311 }
312
314 dstLoopNestStats, slice,
315 &fusedLoopNestComputeCost)) {
316 LLVM_DEBUG(llvm::dbgs() << "Unable to compute fusion compute cost\n");
317 return std::nullopt;
318 }
319
320 double additionalComputeFraction =
321 fusedLoopNestComputeCost /
322 (static_cast<double>(srcLoopNestCost) + dstLoopNestCost) -
323 1;
324
325 return additionalComputeFraction;
326 }
327
328
329
330
331
332
333
336 unsigned dstLoopDepth,
337 std::optional fastMemorySpace,
338 Block *sliceInsertionBlock,
339 uint64_t localBufSizeThreshold) {
340 assert(!storeOps.empty() && "no source stores supplied");
341
342
343
344
345
346 if (storeOps.size() > 1 &&
347 !std::equal(std::next(storeOps.begin()), storeOps.end(), storeOps.begin(),
349 MemRefAccess aM(cast(a));
350 MemRefAccess bM(cast(b));
351 return aM == bM;
352 })) {
353 LLVM_DEBUG(llvm::dbgs()
354 << "Private memref creation unsupported for multiple producer "
355 "stores with different access functions.\n");
356 return nullptr;
357 }
358
359 Operation *srcStoreOp = storeOps[0];
360
361
363
364 OpBuilder top(forOp->getParentRegion());
365
366 auto oldMemRef = cast(srcStoreOp).getMemRef();
367 auto oldMemRefType = cast(oldMemRef.getType());
368 unsigned rank = oldMemRefType.getRank();
369
370
372 bool validRegion = succeeded(
373 region.compute(srcStoreOp, dstLoopDepth, nullptr,
374 true, false));
375
376 (void)validRegion;
377 assert(validRegion && "unexpected memref region failure");
380 lbs.reserve(rank);
381
382
383 std::optional<int64_t> numElements =
385 assert(numElements && "non-constant number of elts in local buffer");
386
388
389
390
393
394
396 offsets.reserve(rank);
397
398
399
401 for (unsigned j = 0, e = lbs[0].getNumSymbols(); j < e; ++j)
403 for (unsigned d = 0; d < rank; ++d) {
404 assert(lbs[d].getNumResults() == 1 &&
405 "invalid private memref bound calculation");
406 offsets.push_back(lbs[d].getResult(0).replaceSymbols(replacements));
407 }
408
409
410
412 assert(eltSize && "memrefs with size elt types expected");
413 uint64_t bufSize = *eltSize * *numElements;
415 if (bufSize <= localBufSizeThreshold && fastMemorySpace.has_value()) {
416 newMemSpace = b.getI64IntegerAttr(*fastMemorySpace);
417 } else {
418 newMemSpace = oldMemRefType.getMemorySpace();
419 }
420 auto newMemRefType = MemRefType::get(newShape, oldMemRefType.getElementType(),
421 AffineMap(), newMemSpace);
422
423
424
425
426
427
428
429 Value newMemRef = top.creatememref::AllocOp(forOp.getLoc(), newMemRefType);
430
431
433 remapExprs.reserve(rank);
434 for (unsigned i = 0; i < rank; i++) {
435 auto dimExpr = b.getAffineDimExpr(outerIVs.size() + i);
436
437 auto remapExpr =
439 remapExprs.push_back(remapExpr);
440 }
441
442 auto indexRemap =
443 AffineMap::get(outerIVs.size() + rank, 0, remapExprs, forOp.getContext());
444
445
449 oldMemRef, newMemRef, {}, indexRemap,
450 outerIVs,
451 {}, domFilter);
452 assert(succeeded(res) &&
453 "replaceAllMemrefUsesWith should always succeed here");
454 (void)res;
455 LLVM_DEBUG(llvm::dbgs() << "Created private memref of type: " << newMemRefType
456 << '\n');
457 return newMemRef;
458 }
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
500 AffineForOp dstForOp,
502 unsigned maxLegalFusionDepth,
503 unsigned *dstLoopDepth,
504 double computeToleranceThreshold) {
505 LLVM_DEBUG({
506 llvm::dbgs()
507 << "Checking whether fusion is profitable between source nest:\n";
508 llvm::dbgs() << ' ' << srcForOp << " and destination nest:\n";
509 llvm::dbgs() << dstForOp << "\n";
510 });
511
512 if (maxLegalFusionDepth == 0) {
513 LLVM_DEBUG(llvm::dbgs() << "Can't fuse: maxLegalFusionDepth is 0\n");
514 return false;
515 }
516
517
518
519
522 return false;
523
524
527 return false;
528
529
530
531
532
533
534
535
536 if (producerStores.size() > 1) {
537 LLVM_DEBUG(llvm::dbgs() << "Limited profitability analysis. Not "
538 "supported for multiple producer store case.\n");
539 int64_t sliceCost;
540 int64_t fusedLoopNestComputeCost;
541
542
544 srcForOp, dstForOp, maxLegalFusionDepth, depthSliceUnions, sliceCost,
545 fusedLoopNestComputeCost);
546 if (!fraction || fraction > computeToleranceThreshold) {
547 LLVM_DEBUG(llvm::dbgs() << "Additional computation exceeds "
548 "compute tolerance. Not fusing.\n");
549 return false;
550 }
551 LLVM_DEBUG(llvm::dbgs()
552 << "Considering fusion profitable at max legal depth.\n");
553 return true;
554 }
555
556 Operation *srcStoreOp = producerStores.front();
557
558
559
560
561
562
563
565 double maxStorageReduction = 0.0;
566 std::optional<uint64_t> sliceMemEstimate;
567
568
569 std::optional bestDstLoopDepth;
570
571
573 if (failed(srcWriteRegion.compute(srcStoreOp, 0))) {
574 LLVM_DEBUG(llvm::dbgs()
575 << "Unable to compute MemRefRegion for source operation\n");
576 return false;
577 }
578
579 std::optional<int64_t> maybeSrcWriteRegionSizeBytes =
581 if (!maybeSrcWriteRegionSizeBytes.has_value())
582 return false;
583 int64_t srcWriteRegionSizeBytes = *maybeSrcWriteRegionSizeBytes;
584
585
586 uint64_t srcLoopNestCost = getComputeCost(srcForOp, srcLoopNestStats);
587
588
589 uint64_t dstLoopNestCost = getComputeCost(dstForOp, dstLoopNestStats);
590
591
592
593 for (unsigned i = maxLegalFusionDepth; i >= 1; --i) {
595
597 continue;
598
599
600
601 int64_t sliceCost;
602
603 int64_t fusedLoopNestComputeCost;
604
605 auto mayAdditionalComputeFraction =
607 sliceCost, fusedLoopNestComputeCost);
608 if (!mayAdditionalComputeFraction) {
609 LLVM_DEBUG(llvm::dbgs()
610 << "Can't determine additional compute fraction.\n");
611 continue;
612 }
613 double additionalComputeFraction = *mayAdditionalComputeFraction;
614
615
616
617
619 if (failed(sliceWriteRegion.compute(srcStoreOp, 0, &slice))) {
620 LLVM_DEBUG(llvm::dbgs()
621 << "Failed to compute slice write region at loopDepth: " << i
622 << "\n");
623 continue;
624 }
625
626 std::optional<int64_t> maybeSliceWriteRegionSizeBytes =
628 if (!maybeSliceWriteRegionSizeBytes.has_value() ||
629 *maybeSliceWriteRegionSizeBytes == 0) {
630 LLVM_DEBUG(llvm::dbgs()
631 << "Failed to get slice write region size at loopDepth: " << i
632 << "\n");
633 continue;
634 }
635 int64_t sliceWriteRegionSizeBytes = *maybeSliceWriteRegionSizeBytes;
636
637 double storageReduction = static_cast<double>(srcWriteRegionSizeBytes) /
638 static_cast<double>(sliceWriteRegionSizeBytes);
639
640 LLVM_DEBUG({
641 std::stringstream msg;
642 msg << " evaluating fusion profitability at depth : " << i << "\n"
643 << std::fixed << std::setprecision(2)
644 << " additional compute fraction: "
645 << 100.0 * additionalComputeFraction << "%\n"
646 << " storage reduction factor: " << storageReduction << "x\n"
647 << " fused nest cost: " << fusedLoopNestComputeCost << "\n"
648 << " src write region size: " << srcWriteRegionSizeBytes << "\n"
649 << " slice write region size: " << sliceWriteRegionSizeBytes
650 << "\n";
651 llvm::dbgs() << msg.str();
652 });
653
654
655
656
657
658 if ((storageReduction > maxStorageReduction) &&
659 (additionalComputeFraction <= computeToleranceThreshold)) {
660 maxStorageReduction = storageReduction;
661 bestDstLoopDepth = i;
662 minFusedLoopNestComputeCost = fusedLoopNestComputeCost;
663 sliceMemEstimate = sliceWriteRegionSizeBytes;
664 }
665 }
666
667
668
669 if (!bestDstLoopDepth) {
670 LLVM_DEBUG(
671 llvm::dbgs()
672 << "All fusion choices involve more than the threshold amount of "
673 "redundant computation; NOT fusing.\n");
674 return false;
675 }
676
677 if (!bestDstLoopDepth) {
678 LLVM_DEBUG(llvm::dbgs() << "no fusion depth could be evaluated.\n");
679 return false;
680 }
681
682
683 *dstLoopDepth = *bestDstLoopDepth;
684
685 LLVM_DEBUG(
686 llvm::dbgs() << " LoopFusion fusion stats:"
687 << "\n best loop depth: " << bestDstLoopDepth
688 << "\n src loop nest compute cost: " << srcLoopNestCost
689 << "\n dst loop nest compute cost: " << dstLoopNestCost
690 << "\n fused loop nest compute cost: "
691 << minFusedLoopNestComputeCost << "\n");
692
695
696 std::optional storageReduction;
697
698 if (!dstMemSize || !srcMemSize) {
699 LLVM_DEBUG(llvm::dbgs()
700 << " fusion memory benefit cannot be evaluated; NOT fusing.\n");
701 return false;
702 }
703
704 auto srcMemSizeVal = *srcMemSize;
705 auto dstMemSizeVal = *dstMemSize;
706
707 assert(sliceMemEstimate && "expected value");
708 auto fusedMem = dstMemSizeVal + *sliceMemEstimate;
709
710 LLVM_DEBUG(llvm::dbgs() << " src mem: " << srcMemSizeVal << "\n"
711 << " dst mem: " << dstMemSizeVal << "\n"
712 << " fused mem: " << fusedMem << "\n"
713 << " slice mem: " << sliceMemEstimate << "\n");
714
715 if (static_cast<long>(fusedMem) > srcMemSizeVal + dstMemSizeVal) {
716 LLVM_DEBUG(llvm::dbgs() << "Fusion is not profitable; NOT fusing.\n");
717 return false;
718 }
719 storageReduction =
720 100.0 *
721 (1.0 - fusedMem / (static_cast<double>(srcMemSizeVal) + dstMemSizeVal));
722
723 double additionalComputeFraction =
724 100.0 * (minFusedLoopNestComputeCost /
725 (static_cast<double>(srcLoopNestCost) + dstLoopNestCost) -
726 1);
727 (void)additionalComputeFraction;
728 LLVM_DEBUG({
729 std::stringstream msg;
730 msg << " fusion is most profitable at depth " << *dstLoopDepth << " with "
731 << std::setprecision(2) << additionalComputeFraction
732 << "% redundant computation and a ";
733 msg << (storageReduction ? std::to_string(*storageReduction) : "");
734 msg << "% storage reduction.\n";
735 llvm::dbgs() << msg.str();
736 });
737
738 return true;
739 }
740
741 namespace {
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789 struct GreedyFusion {
790 public:
791
793
795
796 unsigned localBufSizeThreshold;
797
798 std::optional fastMemorySpace;
799
800
801 bool maximalFusion;
802
803
804 double computeToleranceThreshold;
805
807
809 std::optional fastMemorySpace, bool maximalFusion,
810 double computeToleranceThreshold)
811 : mdg(mdg), localBufSizeThreshold(localBufSizeThreshold),
812 fastMemorySpace(fastMemorySpace), maximalFusion(maximalFusion),
813 computeToleranceThreshold(computeToleranceThreshold) {}
814
815
816 void init() {
817
818
819 worklist.clear();
820 for (auto &idAndNode : mdg->nodes) {
821 const Node &node = idAndNode.second;
822 worklist.push_back(node.id);
823 }
824 }
825
826 void runSiblingFusionOnly() {
827 fuseSiblingNodes();
828 eraseUnusedMemRefAllocations();
829 }
830
831
832 void runProducerConsumerFusionOnly() {
833 fuseProducerConsumerNodes(
835 eraseUnusedMemRefAllocations();
836 }
837
838
839
840
841
842
843 void runGreedyFusion() {
844
845 fuseProducerConsumerNodes(1);
846 fuseSiblingNodes();
847 fuseProducerConsumerNodes(
849 eraseUnusedMemRefAllocations();
850 }
851
852
853
854 bool canCreatePrivateMemRef(Value memref,
856 unsigned producerId, unsigned consumerId,
857 bool removeSrcNode) {
858
860 return false;
861 const Node *consumerNode = mdg->getNode(consumerId);
862
863
864
865
866
867
868
869 if (srcEscapingMemRefs.count(memref) > 0 &&
870 (removeSrcNode || consumerNode->getStoreOpCount(memref) > 0))
871 return false;
872
873
874
877 return false;
878
879
880
881
882 if (removeSrcNode &&
883 any_of(mdg->outEdges[producerId], [&](const auto &edge) {
884 return edge.value == memref && edge.id != consumerId;
885 }))
886 return false;
887
888 return true;
889 }
890
891
892
893
894 void performFusionsIntoDest(unsigned dstId, unsigned maxSrcUserCount) {
895 LLVM_DEBUG(llvm::dbgs() << "Evaluating dst loop " << dstId << "\n");
896
897 if (mdg->nodes.count(dstId) == 0)
898 return;
899
900 auto *dstNode = mdg->getNode(dstId);
901
902 if (!isa(dstNode->op))
903 return;
904
905
906 if (dstNode->op->getNumResults() > 0)
907 return;
908
909 LLVM_DEBUG(llvm::dbgs() << "Evaluating dst loop " << dstId << "\n");
910
911
912
913
914
916 auto dstAffineForOp = cast(dstNode->op);
917
918
919
920 bool dstNodeChanged;
921 do {
922
923
924
925
926
927 dstNodeChanged = false;
930
931 for (unsigned srcId : llvm::reverse(srcIdCandidates)) {
932
933 auto *srcNode = mdg->getNode(srcId);
934 auto srcAffineForOp = cast(srcNode->op);
935
936 LLVM_DEBUG(llvm::dbgs()
937 << "Trying to fuse producer loop nest " << srcId
938 << " with consumer loop nest " << dstId << "\n");
939 LLVM_DEBUG(llvm::dbgs() << "Compute tolerance threshold: "
940 << computeToleranceThreshold << '\n');
941 LLVM_DEBUG(llvm::dbgs()
942 << "Producer loop nest:\n"
943 << *srcNode->op << "\n and consumer loop nest:\n"
944 << *dstNode->op << '\n');
945
946 LLVM_DEBUG(llvm::dbgs() << "Evaluating src loop " << srcId
947 << " for dst loop " << dstId << "\n");
948
949
950
951 if (isa(srcNode->op) && srcNode->op->getNumResults() > 0)
952 continue;
953
956 producerConsumerMemrefs);
957
958
959
960 if (any_of(producerConsumerMemrefs, [&](Value memref) {
962 maxSrcUserCount;
963 }))
964 continue;
965
966
967
968
971
972
973
976 if (fusedLoopInsPoint == nullptr)
977 continue;
978
979
980
981
982
983
984
987 unsigned numSurroundingLoops = surroundingLoops.size();
988
989
990
992 for (Operation *op : dstNode->loads)
993 if (producerConsumerMemrefs.count(
994 cast(op).getMemRef()) > 0)
995 dstMemrefOps.push_back(op);
996 for (Operation *op : dstNode->stores)
997 if (producerConsumerMemrefs.count(
998 cast(op).getMemRef()))
999 dstMemrefOps.push_back(op);
1000 unsigned dstLoopDepthTest =
1002
1003
1004
1005 unsigned maxLegalFusionDepth = 0;
1007 depthSliceUnions.resize(dstLoopDepthTest);
1009 for (unsigned i = 1; i <= dstLoopDepthTest; ++i) {
1012 i + numSurroundingLoops,
1013 &depthSliceUnions[i - 1], strategy);
1015 maxLegalFusionDepth = i;
1016 LLVM_DEBUG(llvm::dbgs()
1017 << "Found valid slice for depth: " << i << '\n');
1018 }
1019 }
1020
1021 if (maxLegalFusionDepth == 0) {
1022 LLVM_DEBUG(llvm::dbgs()
1023 << "Can't fuse: fusion is not legal at any depth\n");
1024 continue;
1025 }
1026
1027 LLVM_DEBUG(llvm::dbgs() << "Max legal depth for fusion: "
1028 << maxLegalFusionDepth << '\n');
1029
1030 double computeToleranceThresholdToUse = computeToleranceThreshold;
1031
1032
1033
1034
1035
1036
1038 LLVM_DEBUG(llvm::dbgs() << "Source nest has a cyclic dependence.\n");
1039
1040
1041
1042 if (maximalFusion) {
1043 auto srcForOp = cast(srcNode->op);
1044 auto dstForOp = cast(dstNode->op);
1045 int64_t sliceCost;
1046 int64_t fusedLoopNestComputeCost;
1048 srcForOp, dstForOp, maxLegalFusionDepth, depthSliceUnions,
1049 sliceCost, fusedLoopNestComputeCost);
1050 if (!fraction || fraction > 0) {
1051 LLVM_DEBUG(
1052 llvm::dbgs()
1053 << "Can't perform maximal fusion with a cyclic dependence "
1054 "and non-zero additional compute.\n");
1055 return;
1056 }
1057 } else {
1058
1059
1060 LLVM_DEBUG(llvm::dbgs()
1061 << "Setting compute tolerance to zero since "
1062 "source has a cylic dependence.\n");
1063 computeToleranceThresholdToUse = 0;
1064 }
1065 }
1066
1067
1068
1069
1070 unsigned bestDstLoopDepth = maxLegalFusionDepth;
1071 if (!maximalFusion) {
1072
1074 for (Operation *op : srcNode->stores)
1075 if (producerConsumerMemrefs.count(
1076 cast(op).getMemRef()))
1077 producerStores.push_back(op);
1078
1079 assert(!producerStores.empty() && "Expected producer store");
1081 dstAffineForOp, depthSliceUnions,
1082 maxLegalFusionDepth, &bestDstLoopDepth,
1083 computeToleranceThresholdToUse)) {
1084 continue;
1085 }
1086 }
1087
1088 assert(bestDstLoopDepth > 0 && "Unexpected loop fusion depth");
1090 depthSliceUnions[bestDstLoopDepth - 1];
1091 assert(!bestSlice.isEmpty() && "Missing slice union for depth");
1092
1093
1094
1095
1097 srcId, dstId, bestSlice, fusedLoopInsPoint, srcEscapingMemRefs,
1098 *mdg);
1099
1101 for (Value memref : producerConsumerMemrefs) {
1102 if (canCreatePrivateMemRef(memref, srcEscapingMemRefs, srcId, dstId,
1103 removeSrcNode)) {
1104
1105 LLVM_DEBUG(llvm::dbgs()
1106 << "Creating private memref for " << memref << '\n');
1107
1108 privateMemrefs.insert(memref);
1109 }
1110 }
1111
1112
1113 fuseLoops(srcAffineForOp, dstAffineForOp, bestSlice);
1114 dstNodeChanged = true;
1115
1116 LLVM_DEBUG(llvm::dbgs()
1117 << "Fused src loop " << srcId << " into dst loop " << dstId
1118 << " at depth " << bestDstLoopDepth << ":\n"
1119 << dstAffineForOp << "\n");
1120
1121
1122 if (fusedLoopInsPoint != dstAffineForOp)
1123 dstAffineForOp->moveBefore(fusedLoopInsPoint);
1124
1125
1126 mdg->updateEdges(srcNode->id, dstNode->id, privateMemrefs,
1127 removeSrcNode);
1128
1129
1130 if (!privateMemrefs.empty()) {
1131
1132
1133 Block *sliceInsertionBlock = bestSlice.insertPoint->getBlock();
1134
1135
1137 dstAffineForOp.walk([&](AffineWriteOpInterface storeOp) {
1138 Value storeMemRef = storeOp.getMemRef();
1139 if (privateMemrefs.count(storeMemRef) > 0)
1140 privateMemRefToStores[storeMemRef].push_back(storeOp);
1141 });
1142
1143
1144
1145
1146
1147 for (auto &memrefToStoresPair : privateMemRefToStores) {
1150 dstAffineForOp, storesForMemref, bestDstLoopDepth,
1151 fastMemorySpace, sliceInsertionBlock, localBufSizeThreshold);
1152 if (!newMemRef)
1153 continue;
1154
1156
1157 mdg->addEdge(newMemRefNodeId, dstId, newMemRef);
1158 }
1159
1160
1161
1162 dstNode = mdg->getNode(dstId);
1163 }
1164
1165
1167 dstLoopCollector.collect(dstAffineForOp);
1168
1169
1175
1176 if (removeSrcNode) {
1177 LLVM_DEBUG(llvm::dbgs()
1178 << "Removing src loop " << srcId << " after fusion\n");
1179
1180 srcAffineForOp.erase();
1182 srcNode = nullptr;
1183 }
1184 }
1185 } while (dstNodeChanged);
1186 }
1187
1188
1189
1190
1191
1192 void fuseProducerConsumerNodes(unsigned maxSrcUserCount) {
1193 LLVM_DEBUG(llvm::dbgs() << "--- Producer/Consumer Fusion ---\n");
1194 init();
1195 while (!worklist.empty()) {
1196 unsigned dstId = worklist.back();
1197 worklist.pop_back();
1198 performFusionsIntoDest(dstId, maxSrcUserCount);
1199 }
1200 }
1201
1202
1203
1204 void fuseSiblingNodes() {
1205 LLVM_DEBUG(llvm::dbgs() << "--- Sibling Fusion ---\n");
1206 init();
1207 while (!worklist.empty()) {
1208 unsigned dstId = worklist.back();
1209 worklist.pop_back();
1210
1211
1212 if (mdg->nodes.count(dstId) == 0)
1213 continue;
1214
1215 auto *dstNode = mdg->getNode(dstId);
1216
1217 if (!isa(dstNode->op))
1218 continue;
1219
1220 fuseWithSiblingNodes(dstNode);
1221 }
1222 }
1223
1224
1225 void fuseWithSiblingNodes(Node *dstNode) {
1227 std::pair<unsigned, Value> idAndMemref;
1228 auto dstAffineForOp = cast(dstNode->op);
1229
1230 while (findSiblingNodeToFuse(dstNode, &visitedSibNodeIds, &idAndMemref)) {
1231 unsigned sibId = idAndMemref.first;
1232 Value memref = idAndMemref.second;
1233
1234
1235 auto *sibNode = mdg->getNode(sibId);
1236
1237
1238 assert(sibNode->op->getBlock() == dstNode->op->getBlock());
1243 if (insertPointInst == nullptr)
1244 continue;
1245
1246
1247
1248
1250 sibNode->getLoadOpsForMemref(memref, &sibLoadOpInsts);
1251
1252 Operation *sibLoadOpInst = llvm::getSingleElement(sibLoadOpInsts);
1253
1254
1256 dstNode->getLoadOpsForMemref(memref, &dstLoadOpInsts);
1257
1258
1259
1260
1261
1262
1263
1266 unsigned numSurroundingLoops = surroundingLoops.size();
1269 unsigned dstLoopDepthTest = dstLoopIVs.size() - numSurroundingLoops;
1270 auto sibAffineForOp = cast(sibNode->op);
1271
1272
1274 depthSliceUnions.resize(dstLoopDepthTest);
1275 unsigned maxLegalFusionDepth = 0;
1277 for (unsigned i = 1; i <= dstLoopDepthTest; ++i) {
1280 i + numSurroundingLoops,
1281 &depthSliceUnions[i - 1], strategy);
1282
1284 maxLegalFusionDepth = i;
1285 }
1286
1287 LLVM_DEBUG(llvm::dbgs() << "Max legal depth for fusion: "
1288 << maxLegalFusionDepth << '\n');
1289
1290
1291 if (maxLegalFusionDepth == 0)
1292 continue;
1293
1294 double computeToleranceThresholdToUse = computeToleranceThreshold;
1295
1296
1297
1298
1299
1300
1302 LLVM_DEBUG(llvm::dbgs() << "Source nest has a cyclic dependence.\n");
1303
1304
1305
1306 if (maximalFusion) {
1307 auto dstForOp = cast(dstNode->op);
1308 int64_t sliceCost;
1309 int64_t fusedLoopNestComputeCost;
1311 sibAffineForOp, dstForOp, maxLegalFusionDepth, depthSliceUnions,
1312 sliceCost, fusedLoopNestComputeCost);
1313 if (!fraction || fraction > 0) {
1314 LLVM_DEBUG(
1315 llvm::dbgs()
1316 << "Can't perform maximal fusion with a cyclic dependence "
1317 "and non-zero additional compute.\n");
1318 return;
1319 }
1320 } else {
1321
1322
1323 LLVM_DEBUG(llvm::dbgs() << "Setting compute tolerance to zero since "
1324 "source has a cyclic dependence.\n");
1325 computeToleranceThresholdToUse = 0.0;
1326 }
1327 }
1328
1329 unsigned bestDstLoopDepth = maxLegalFusionDepth;
1330 if (!maximalFusion) {
1331
1332
1333
1334
1335 if ((sibAffineForOp, sibLoadOpInst, dstAffineForOp,
1336 depthSliceUnions, maxLegalFusionDepth,
1337 &bestDstLoopDepth,
1338 computeToleranceThresholdToUse))
1339 continue;
1340 }
1341
1342 assert(bestDstLoopDepth > 0 && "Unexpected loop fusion depth");
1343
1345 depthSliceUnions[bestDstLoopDepth - 1];
1346 assert(!bestSlice.isEmpty() &&
1347 "Fusion depth has no computed slice union");
1348
1349
1350
1351
1352 auto isMaximal = bestSlice.isMaximal();
1353 if (!isMaximal.value_or(false)) {
1354 LLVM_DEBUG(llvm::dbgs()
1355 << "Slice isn't maximal; not performing sibling fusion.\n");
1356 continue;
1357 }
1358
1359
1360
1361
1362 bool isInnermostInsertion = (bestDstLoopDepth == dstLoopDepthTest);
1363
1365 isInnermostInsertion);
1366
1367 auto dstForInst = cast(dstNode->op);
1368
1369 if (insertPointInst != dstForInst)
1370 dstForInst->moveBefore(insertPointInst);
1371
1372 LLVM_DEBUG(llvm::dbgs()
1373 << "Fused sibling nest " << sibId << " into destination nest "
1374 << dstNode->id << " at depth " << bestDstLoopDepth << ":\n"
1375 << dstAffineForOp << "\n");
1376
1377
1378 updateStateAfterSiblingFusion(sibNode, dstNode);
1379
1380
1381
1385 }
1386 }
1387
1388
1389
1390
1391
1392 bool findSiblingNodeToFuse(Node *dstNode,
1394 std::pair<unsigned, Value> *idAndMemrefToFuse) {
1395
1396
1397 auto canFuseWithSibNode = [&](Node *sibNode, Value memref) {
1398
1399
1400 if (sibNode->getLoadOpCount(memref) != 1)
1401 return false;
1402
1403
1406 return false;
1407
1408
1410 sibNode->getLoadAndStoreMemrefSet(&loadAndStoreMemrefSet);
1411 if (llvm::any_of(loadAndStoreMemrefSet, [=](Value memref) {
1413 }))
1414 return false;
1415
1416
1418 for (auto *storeOpInst : sibNode->stores) {
1419 storeMemrefs.insert(
1420 cast(storeOpInst).getMemRef());
1421 }
1422 return storeMemrefs.size() <= 1;
1423 };
1424
1425
1426 Block *block = dstNode->op->getBlock();
1427 for (unsigned i = 0, e = block->getNumArguments(); i != e; ++i) {
1429 auto loadOp = dyn_cast(user);
1430 if (!loadOp)
1431 continue;
1432
1435
1436
1437
1438 auto *it = llvm::find_if(loops, [&](AffineForOp loop) {
1439 return loop->getBlock() == &mdg->block;
1440 });
1441
1442 if (it == loops.end())
1443 continue;
1445 assert(sibNode != nullptr);
1446
1447 if (sibNode->id == dstNode->id)
1448 continue;
1449
1450 if (visitedSibNodeIds->count(sibNode->id) > 0)
1451 continue;
1452
1453 auto memref = loadOp.getMemRef();
1454 if (dstNode->getLoadOpCount(memref) == 0)
1455 continue;
1456
1457 if (canFuseWithSibNode(sibNode, memref)) {
1458 visitedSibNodeIds->insert(sibNode->id);
1459 idAndMemrefToFuse->first = sibNode->id;
1460 idAndMemrefToFuse->second = memref;
1461 return true;
1462 }
1463 }
1464 }
1465
1466
1467
1471
1472 if (dstNode->getLoadOpCount(inEdge.value) > 0 &&
1473 mdg->getNode(inEdge.id)->getStoreOpCount(inEdge.value) > 0)
1474 inEdges.push_back(inEdge);
1475 });
1476
1477
1478
1479 for (auto &inEdge : inEdges) {
1480
1484 unsigned sibNodeId = outEdge.id;
1485 if (visitedSibNodeIds->count(sibNodeId) > 0)
1486 return;
1487
1488 if (outEdge.id == dstNode->id || outEdge.value != inEdge.value)
1489 return;
1490 auto *sibNode = mdg->getNode(sibNodeId);
1491 if (!isa(sibNode->op))
1492 return;
1493
1494 if (canFuseWithSibNode(sibNode, outEdge.value)) {
1495
1496 outEdges.push_back(outEdge);
1497 }
1498 });
1499
1500
1501 if (!outEdges.empty()) {
1502 visitedSibNodeIds->insert(outEdges[0].id);
1503 idAndMemrefToFuse->first = outEdges[0].id;
1504 idAndMemrefToFuse->second = outEdges[0].value;
1505 return true;
1506 }
1507 }
1508 return false;
1509 }
1510
1511
1512
1513 void updateStateAfterSiblingFusion(Node *sibNode, Node *dstNode) {
1514
1515 mdg->updateEdges(sibNode->id, dstNode->id);
1516
1517
1518 auto dstForInst = cast(dstNode->op);
1520 dstLoopCollector.collect(dstForInst);
1521
1526 }
1527
1528
1529 void eraseUnusedMemRefAllocations() {
1531 if (pair.second > 0)
1532 continue;
1533 auto memref = pair.first;
1534
1536 continue;
1537
1539 if (isa_and_nonnullmemref::AllocOp(op))
1541 }
1542 }
1543 };
1544
1545 }
1546
1547
1548 void LoopFusion::runOnBlock(Block *block) {
1550 if (!g.init()) {
1551 LLVM_DEBUG(llvm::dbgs() << "MDG init failed\n");
1552 return;
1553 }
1554
1555 std::optional fastMemorySpaceOpt;
1556 if (fastMemorySpace.hasValue())
1557 fastMemorySpaceOpt = fastMemorySpace;
1558 unsigned localBufSizeThresholdBytes = localBufSizeThreshold * 1024;
1559 GreedyFusion fusion(&g, localBufSizeThresholdBytes, fastMemorySpaceOpt,
1560 maximalFusion, computeToleranceThreshold);
1561
1563 fusion.runProducerConsumerFusionOnly();
1565 fusion.runSiblingFusionOnly();
1566 else
1567 fusion.runGreedyFusion();
1568 }
1569
1570 void LoopFusion::runOnOperation() {
1571
1572
1573 getOperation()->walk([&](Operation *op) {
1575 for (Block &block : region.getBlocks()) {
1576 auto affineFors = block.getOps();
1577 if (!affineFors.empty() && !llvm::hasSingleElement(affineFors))
1578 runOnBlock(&block);
1579 }
1580 }
1581 });
1582 }
1583
1585 unsigned fastMemorySpace, uint64_t localBufSizeThreshold,
1586 bool maximalFusion, enum FusionMode affineFusionMode) {
1587 return std::make_unique(fastMemorySpace, localBufSizeThreshold,
1588 maximalFusion, affineFusionMode);
1589 }
MemRefDependenceGraph::Node Node
static Operation * getDominanceFilterForPrivateMemRefRepl(Block *sliceInsertionBlock, ArrayRef< Operation * > producerStores)
Get the operation that should act as a dominance filter while replacing memref uses with a private me...
static void getProducerCandidates(unsigned dstId, const MemRefDependenceGraph &mdg, SmallVectorImpl< unsigned > &srcIdCandidates)
Returns in 'srcIdCandidates' the producer fusion candidates for consumer 'dstId'.
static bool isFusionProfitable(AffineForOp srcForOp, ArrayRef< Operation * > producerStores, AffineForOp dstForOp, ArrayRef< ComputationSliceState > depthSliceUnions, unsigned maxLegalFusionDepth, unsigned *dstLoopDepth, double computeToleranceThreshold)
static bool isEscapingMemref(Value memref, Block *block)
A memref escapes in the context of the fusion pass if either:
static bool canRemoveSrcNodeAfterFusion(unsigned srcId, unsigned dstId, const ComputationSliceState &fusionSlice, Operation *fusedLoopInsPoint, const DenseSet< Value > &escapingMemRefs, const MemRefDependenceGraph &mdg)
Returns true if node 'srcId' can be removed after fusing it with node 'dstId'.
static Value createPrivateMemRef(AffineForOp forOp, ArrayRef< Operation * > storeOps, unsigned dstLoopDepth, std::optional< unsigned > fastMemorySpace, Block *sliceInsertionBlock, uint64_t localBufSizeThreshold)
static void gatherEscapingMemrefs(unsigned id, const MemRefDependenceGraph &mdg, DenseSet< Value > &escapingMemRefs)
Returns in 'escapingMemRefs' the memrefs from affine store ops in node 'id' that escape the block or ...
static std::optional< double > getAdditionalComputeFraction(AffineForOp srcForOp, AffineForOp dstForOp, unsigned depth, ArrayRef< ComputationSliceState > depthSliceUnions, int64_t &sliceCost, int64_t &fusedLoopNestComputeCost)
Returns the amount of additional (redundant) computation that will be done as a fraction of the total...
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
A multi-dimensional affine map Affine map's are immutable like Type's, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
Attributes are known-constant values of operations.
Block represents an ordered list of Operations.
Operation * findAncestorOpInBlock(Operation &op)
Returns 'op' if 'op' lies in this block, or otherwise finds the ancestor operation of 'op' that lies ...
BlockArgument getArgument(unsigned i)
unsigned getNumArguments()
void getValues(unsigned start, unsigned end, SmallVectorImpl< Value > *values) const
Returns the Values associated with variables in range [start, end).
This class helps build Operations.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Operation is the basic unit of execution within MLIR.
bool isBeforeInBlock(Operation *other)
Given an operation 'other' that is within the same parent block, return whether the current operation...
Location getLoc()
The source location the operation was defined or derived from.
Block * getBlock()
Returns the operation block that contains this operation.
MutableArrayRef< Region > getRegions()
Returns the regions held by this operation.
void erase()
Remove this operation from its parent block and delete it.
This class contains a list of basic blocks and a link to the parent operation it is attached to.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
bool use_empty() const
Returns true if this value has no uses.
Type getType() const
Return the type of this value.
user_range getUsers() const
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
FlatAffineValueConstraints is an extension of FlatLinearValueConstraints with helper functions for Af...
Describes the fusion strategy to be used in the Affine loop fusion utilities.
unsigned getNumDimAndSymbolVars() const
bool getFusionComputeCost(AffineForOp srcForOp, LoopNestStats &srcStats, AffineForOp dstForOp, LoopNestStats &dstStats, const ComputationSliceState &slice, int64_t *computeCost)
Computes and returns in 'computeCost', the total compute cost of fusing the 'slice' of the loop nest ...
void gatherProducerConsumerMemrefs(ArrayRef< Operation * > srcOps, ArrayRef< Operation * > dstOps, DenseSet< Value > &producerConsumerMemrefs)
Returns in 'producerConsumerMemrefs' the memrefs involved in a producer-consumer dependence between w...
int64_t getComputeCost(AffineForOp forOp, LoopNestStats &stats)
Computes the total cost of the loop nest rooted at 'forOp' using 'stats'.
void fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, const ComputationSliceState &srcSlice, bool isInnermostSiblingInsertionFusion=false)
Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point and source slice loop bo...
void getAffineForIVs(Operation &op, SmallVectorImpl< AffineForOp > *loops)
Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered from the outermost 'affine....
std::optional< int64_t > getMemoryFootprintBytes(AffineForOp forOp, int memorySpace=-1)
Gets the memory footprint of all data touched in the specified memory space in bytes; if the memory s...
std::unique_ptr< Pass > createLoopFusionPass(unsigned fastMemorySpace=0, uint64_t localBufSizeThreshold=0, bool maximalFusion=false, enum FusionMode fusionMode=FusionMode::Greedy)
Creates a loop fusion pass which fuses affine loop nests at the top-level of the operation the pass i...
FusionMode
Fusion mode to attempt.
unsigned getInnermostCommonLoopDepth(ArrayRef< Operation * > ops, SmallVectorImpl< AffineForOp > *surroundingLoops=nullptr)
Returns the innermost common loop depth for the set of operations in 'ops'.
bool getLoopNestStats(AffineForOp forOp, LoopNestStats *stats)
Collect loop nest statistics (eg.
AffineForOp sinkSequentialLoops(AffineForOp forOp)
bool hasCyclicDependence(AffineForOp root)
Returns true if the affine nest rooted at root has a cyclic dependence among its affine memory access...
mlir::Block * findInnermostCommonBlockInScope(mlir::Operation *a, mlir::Operation *b)
Find the innermost common Block of a and b in the affine scope that a and b are part of.
FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, unsigned dstLoopDepth, ComputationSliceState *srcSlice, FusionStrategy fusionStrategy=FusionStrategy::Generic)
Checks the feasibility of fusing the loop nest rooted at 'srcForOp' into the loop nest rooted at 'dst...
LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, ArrayRef< Value > extraIndices={}, AffineMap indexRemap=AffineMap(), ArrayRef< Value > extraOperands={}, ArrayRef< Value > symbolOperands={}, Operation *domOpFilter=nullptr, Operation *postDomOpFilter=nullptr, bool allowNonDereferencingOps=false, bool replaceInDeallocOp=false)
Replaces all "dereferencing" uses of oldMemRef with newMemRef while optionally remapping the old memr...
std::optional< int64_t > getMemRefIntOrFloatEltSizeInBytes(MemRefType memRefType)
Returns the memref's element type's size in bytes where the elemental type is an int or float or a ve...
Include the generated interface declarations.
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
AffineExpr simplifyAffineExpr(AffineExpr expr, unsigned numDims, unsigned numSymbols)
Simplify an affine expression by flattening and some amount of simple analysis.
AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context)
These free functions allow clients of the API to not use classes in detail.
ComputationSliceState aggregates loop IVs, loop bound AffineMaps and their associated operands for a ...
bool isEmpty() const
Returns true if the computation slice is empty.
std::optional< bool > isMaximal() const
Returns true if the computation slice encloses all the iterations of the sliced loop nest.
Block::iterator insertPoint
enum mlir::affine::FusionResult::ResultEnum value
SmallVector< Operation *, 4 > memrefFrees
SmallVector< Operation *, 4 > loadOpInsts
SmallVector< Operation *, 4 > memrefStores
void collect(Operation *opToWalk)
SmallVector< Operation *, 4 > memrefLoads
SmallVector< Operation *, 4 > storeOpInsts
LoopNestStats aggregates various per-loop statistics (eg.
DenseMap< unsigned, SmallVector< Edge, 2 > > outEdges
Block & block
The block for which this graph is created to perform fusion.
unsigned addNode(Operation *op)
void addEdge(unsigned srcId, unsigned dstId, Value value)
DenseMap< unsigned, Node > nodes
bool hasDependencePath(unsigned srcId, unsigned dstId) const
void clearNodeLoadAndStores(unsigned id)
const Node * getForOpNode(AffineForOp forOp) const
Operation * getFusedLoopNestInsertionPoint(unsigned srcId, unsigned dstId) const
void updateEdges(unsigned srcId, unsigned dstId, const DenseSet< Value > &privateMemRefs, bool removeSrcId)
DenseMap< unsigned, SmallVector< Edge, 2 > > inEdges
void forEachMemRefInputEdge(unsigned id, const std::function< void(Edge)> &callback)
unsigned getOutEdgeCount(unsigned id, Value memref=nullptr) const
const Node * getNode(unsigned id) const
void removeNode(unsigned id)
void forEachMemRefOutputEdge(unsigned id, const std::function< void(Edge)> &callback)
void addToNode(unsigned id, ArrayRef< Operation * > loads, ArrayRef< Operation * > stores, ArrayRef< Operation * > memrefLoads, ArrayRef< Operation * > memrefStores, ArrayRef< Operation * > memrefFrees)
unsigned getIncomingMemRefAccesses(unsigned id, Value memref) const
DenseMap< Value, unsigned > memrefEdgeCount
A region of a memref's data space; this is typically constructed by analyzing load/store op's on this...
std::optional< int64_t > getConstantBoundingSizeAndShape(SmallVectorImpl< int64_t > *shape=nullptr, SmallVectorImpl< AffineMap > *lbs=nullptr) const
Returns a constant upper bound on the number of elements in this region if bounded by a known constan...
FlatAffineValueConstraints * getConstraints()
LogicalResult compute(Operation *op, unsigned loopDepth, const ComputationSliceState *sliceState=nullptr, bool addMemRefDimBounds=true, bool dropLocalVars=true, bool dropOuterIVs=true)
Computes the memory region accessed by this memref with the region represented as constraints symboli...
std::optional< int64_t > getRegionSize()
Returns the size of this MemRefRegion in bytes.
Eliminates variable at the specified position using Fourier-Motzkin variable elimination.