MLIR: lib/Dialect/Affine/Transforms/SuperVectorize.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
13
15
29 #include "llvm/ADT/STLExtras.h"
30 #include "llvm/Support/Debug.h"
31 #include
32
33 namespace mlir {
34 namespace affine {
35 #define GEN_PASS_DEF_AFFINEVECTORIZE
36 #include "mlir/Dialect/Affine/Passes.h.inc"
37 }
38 }
39
40 using namespace mlir;
41 using namespace affine;
42 using namespace vector;
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575 #define DEBUG_TYPE "early-vect"
576
577 using llvm::dbgs;
578
579
582 int fastestVaryingMemRefDimension);
583
584
585
586
587
588 static std::optional
592 int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0];
593 int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1];
594 int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2];
595 switch (vectorRank) {
596 case 1:
598 case 2:
601 case 3:
605 default: {
606 return std::nullopt;
607 }
608 }
609 }
610
613 llvm::IsaPred<vector::TransferReadOp, vector::TransferWriteOp>);
614 return pattern;
615 }
616
617 namespace {
618
619
620
621 struct Vectorize : public affine::impl::AffineVectorizeBase {
622 using Base::Base;
623
624 void runOnOperation() override;
625 };
626
627 }
628
630 unsigned patternDepth,
632 assert(patternDepth > depthInPattern &&
633 "patternDepth is greater than depthInPattern");
634 if (patternDepth - depthInPattern > strategy->vectorSizes.size()) {
635
636 return;
637 }
639 strategy->vectorSizes.size() - (patternDepth - depthInPattern);
640 }
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
658 unsigned depthInPattern,
659 unsigned patternDepth,
661 for (auto m : matches) {
663 patternDepth, strategy))) {
664 return failure();
665 }
667 patternDepth, strategy);
668 }
669 return success();
670 }
671
672
673
674 namespace {
675
677
679
680
681
682
683
684
685
686
687
688
689 void registerOpVectorReplacement(Operation *replaced, Operation *replacement);
690
691
692
693
694
695
696
697
698
699
700
701
702 void registerValueVectorReplacement(Value replaced, Operation *replacement);
703
704
705
706
707
708
709 void registerBlockArgVectorReplacement(BlockArgument replaced,
711
712
713
714
715
716
717
718
719
720
721
722 void registerValueScalarReplacement(Value replaced, Value replacement);
723
724
725
726
727
728
729
730
731
732
733
734 void registerLoopResultScalarReplacement(Value replaced, Value replacement);
735
736
737
738 void getScalarValueReplacementsFor(ValueRange inputVals,
740
741
742 void finishVectorizationPattern(AffineForOp rootLoop);
743
744
745
747
748
750
751 IRMapping valueVectorReplacement;
752
753
754 IRMapping valueScalarReplacement;
755
757
758
760
761
762
764
765
767
768 private:
769
770
771 void registerValueVectorReplacementImpl(Value replaced, Value replacement);
772 };
773
774 }
775
776
777
778
779
780
781
782
783
784
785 void VectorizationState::registerOpVectorReplacement(Operation *replaced,
787 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op:\n");
788 LLVM_DEBUG(dbgs() << *replaced << "\n");
789 LLVM_DEBUG(dbgs() << "into\n");
790 LLVM_DEBUG(dbgs() << *replacement << "\n");
791
793 "Unexpected replaced and replacement results");
794 assert(opVectorReplacement.count(replaced) == 0 && "already registered");
795 opVectorReplacement[replaced] = replacement;
796
797 for (auto resultTuple :
799 registerValueVectorReplacementImpl(std::get<0>(resultTuple),
800 std::get<1>(resultTuple));
801 }
802
803
804
805
806
807
808
809
810
811
812
813 void VectorizationState::registerValueVectorReplacement(
816 "Expected single-result replacement");
818 registerOpVectorReplacement(defOp, replacement);
819 else
820 registerValueVectorReplacementImpl(replaced, replacement->getResult(0));
821 }
822
823
824
825
826
827
828 void VectorizationState::registerBlockArgVectorReplacement(
830 registerValueVectorReplacementImpl(replaced, replacement);
831 }
832
833 void VectorizationState::registerValueVectorReplacementImpl(Value replaced,
834 Value replacement) {
835 assert(!valueVectorReplacement.contains(replaced) &&
836 "Vector replacement already registered");
837 assert(isa(replacement.getType()) &&
838 "Expected vector type in vector replacement");
839 valueVectorReplacement.map(replaced, replacement);
840 }
841
842
843
844
845
846
847
848
849
850
851
852 void VectorizationState::registerValueScalarReplacement(Value replaced,
853 Value replacement) {
854 assert(!valueScalarReplacement.contains(replaced) &&
855 "Scalar value replacement already registered");
856 assert(!isa(replacement.getType()) &&
857 "Expected scalar type in scalar replacement");
858 valueScalarReplacement.map(replaced, replacement);
859 }
860
861
862
863
864
865
866
867
868
869
870 void VectorizationState::registerLoopResultScalarReplacement(
872 assert(isa(replaced.getDefiningOp()));
873 assert(loopResultScalarReplacement.count(replaced) == 0 &&
874 "already registered");
875 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ will replace a result of the loop "
876 "with scalar: "
877 << replacement);
878 loopResultScalarReplacement[replaced] = replacement;
879 }
880
881
882 void VectorizationState::getScalarValueReplacementsFor(
884 for (Value inputVal : inputVals)
885 replacedVals.push_back(valueScalarReplacement.lookupOrDefault(inputVal));
886 }
887
888
890 LLVM_DEBUG(dbgs() << "[early-vect]+++++ erasing:\n" << forOp << "\n");
891 forOp.erase();
892 }
893
894
895 void VectorizationState::finishVectorizationPattern(AffineForOp rootLoop) {
896 LLVM_DEBUG(dbgs() << "\n[early-vect] Finalizing vectorization\n");
898 }
899
900
905 for (auto resultExpr : map.getResults()) {
906 auto singleResMap =
908 auto afOp = state.builder.create(op->getLoc(), singleResMap,
909 mapOperands);
910 results.push_back(afOp);
911 }
912 }
913
914
915
916
919 int fastestVaryingMemRefDimension) {
920 return [¶llelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
921 auto loop = cast(forOp);
922 if (!parallelLoops.contains(loop))
923 return false;
924 int memRefDim = -1;
925 auto vectorizableBody =
927 if (!vectorizableBody)
928 return false;
929 return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||
930 memRefDim == fastestVaryingMemRefDimension;
931 };
932 }
933
934
935
938 assert(!isa(scalarTy) && "Expected scalar type");
940 }
941
942
943
944
947 Type scalarTy = constOp.getType();
948 if (!VectorType::isValidElementType(scalarTy))
949 return nullptr;
950
951 auto vecTy = getVectorType(scalarTy, state.strategy);
953
955 Operation *parentOp = state.builder.getInsertionBlock()->getParentOp();
956
957 while (parentOp && !state.vecLoopToVecDim.count(parentOp))
959 assert(parentOp && state.vecLoopToVecDim.count(parentOp) &&
960 isa(parentOp) && "Expected a vectorized for op");
961 auto vecForOp = cast(parentOp);
962 state.builder.setInsertionPointToStart(vecForOp.getBody());
963 auto newConstOp =
964 state.builder.createarith::ConstantOp(constOp.getLoc(), vecAttr);
965
966
967 state.registerOpVectorReplacement(constOp, newConstOp);
968 return newConstOp;
969 }
970
971
972
976 for (Value operand : applyOp.getOperands()) {
977 if (state.valueVectorReplacement.contains(operand)) {
978 LLVM_DEBUG(
979 dbgs() << "\n[early-vect]+++++ affine.apply on vector operand\n");
980 return nullptr;
981 } else {
982 Value updatedOperand = state.valueScalarReplacement.lookupOrNull(operand);
983 if (!updatedOperand)
984 updatedOperand = operand;
985 updatedOperands.push_back(updatedOperand);
986 }
987 }
988
989 auto newApplyOp = state.builder.create(
990 applyOp.getLoc(), applyOp.getAffineMap(), updatedOperands);
991
992
993 state.registerValueScalarReplacement(applyOp.getResult(),
994 newApplyOp.getResult());
995 return newApplyOp;
996 }
997
998
999
1000
1002 Value oldOperand,
1005 if (!VectorType::isValidElementType(scalarTy))
1006 return nullptr;
1007
1009 reductionKind, scalarTy, state.builder, oldOperand.getLoc());
1010 auto vecTy = getVectorType(scalarTy, state.strategy);
1012 auto newConstOp =
1013 state.builder.createarith::ConstantOp(oldOperand.getLoc(), vecAttr);
1014
1015 return newConstOp;
1016 }
1017
1018
1019
1020
1021
1022
1023
1024
1026 assert(state.strategy->vectorSizes.size() == 1 &&
1027 "Creating a mask non-1-D vectors is not supported.");
1028 assert(vecForOp.getStep() == state.strategy->vectorSizes[0] &&
1029 "Creating a mask for loops with non-unit original step size is not "
1030 "supported.");
1031
1032
1033 if (Value mask = state.vecLoopToMask.lookup(vecForOp))
1034 return mask;
1035
1036
1037
1038 if (vecForOp.hasConstantBounds()) {
1039 int64_t originalTripCount =
1040 vecForOp.getConstantUpperBound() - vecForOp.getConstantLowerBound();
1041 if (originalTripCount % vecForOp.getStepAsInt() == 0)
1042 return nullptr;
1043 }
1044
1046 state.builder.setInsertionPointToStart(vecForOp.getBody());
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058 Location loc = vecForOp.getLoc();
1059
1060
1061
1062 AffineMap ubMap = vecForOp.getUpperBoundMap();
1065 ub = state.builder.create(loc, vecForOp.getUpperBoundMap(),
1066 vecForOp.getUpperBoundOperands());
1067 else
1068 ub = state.builder.create(loc, vecForOp.getUpperBoundMap(),
1069 vecForOp.getUpperBoundOperands());
1070
1072 state.builder.getAffineDimExpr(0) - state.builder.getAffineDimExpr(1);
1073 Value itersLeft =
1075 {ub, vecForOp.getInductionVar()});
1076
1079
1081 state.builder.getIntegerType(1));
1083 state.builder.createvector::CreateMaskOp(loc, maskTy, itersLeft);
1084
1085 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a mask:\n"
1086 << itersLeft << "\n"
1087 << mask << "\n");
1088
1089 state.vecLoopToMask[vecForOp] = mask;
1090 return mask;
1091 }
1092
1093
1094
1095
1096
1097
1101 if (forOp && strategy->loopToVectorDim.count(forOp) == 0)
1102 return true;
1103
1105 auto loop = cast(loopToDim.first);
1106 if (!loop.isDefinedOutsideOfLoop(value))
1107 return false;
1108 }
1109
1111 return false;
1112
1113 return true;
1114 }
1115
1116
1117
1121 Value uniformScalarRepl =
1122 state.valueScalarReplacement.lookupOrDefault(uniformVal);
1123 state.builder.setInsertionPointAfterValue(uniformScalarRepl);
1124
1126 auto bcastOp = state.builder.create(uniformVal.getLoc(),
1127 vectorTy, uniformScalarRepl);
1128 state.registerValueVectorReplacement(uniformVal, bcastOp);
1129 return bcastOp;
1130 }
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1150 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorize operand: " << operand);
1151
1152 if (Value vecRepl = state.valueVectorReplacement.lookupOrNull(operand)) {
1153 LLVM_DEBUG(dbgs() << " -> already vectorized: " << vecRepl);
1154 return vecRepl;
1155 }
1156
1157
1158
1159
1160 assert(!isa(operand.getType()) &&
1161 "Vector op not found in replacement map");
1162
1163
1164 if (auto constOp = operand.getDefiningOparith::ConstantOp()) {
1166 LLVM_DEBUG(dbgs() << "-> constant: " << vecConstant);
1167 return vecConstant.getResult();
1168 }
1169
1170
1173 LLVM_DEBUG(dbgs() << "-> uniform: " << *vecUniform);
1174 return vecUniform->getResult(0);
1175 }
1176
1177
1178
1180 LLVM_DEBUG(dbgs() << "-> unsupported block argument\n");
1181 else
1182
1183 LLVM_DEBUG(dbgs() << "-> non-vectorizable\n");
1184
1185 return nullptr;
1186 }
1187
1188
1192 for (auto &kvp : loopToVectorDim) {
1193 AffineForOp forOp = cast(kvp.first);
1194
1197
1198 unsigned nonInvariant = 0;
1199 for (Value idx : indices) {
1200 if (invariants.count(idx))
1201 continue;
1202
1203 if (++nonInvariant > 1) {
1204 LLVM_DEBUG(dbgs() << "[early‑vect] Bail out: IV "
1205 << forOp.getInductionVar() << " drives "
1206 << nonInvariant << " indices\n");
1207 return true;
1208 }
1209 }
1210 }
1211 return false;
1212 }
1213
1214
1215
1216
1217
1218
1219
1222 MemRefType memRefType = loadOp.getMemRefType();
1223 Type elementType = memRefType.getElementType();
1224 auto vectorType = VectorType::get(state.strategy->vectorSizes, elementType);
1225
1226
1228 state.getScalarValueReplacementsFor(loadOp.getMapOperands(), mapOperands);
1229
1230
1232 indices.reserve(memRefType.getRank());
1233 if (loadOp.getAffineMap() !=
1234 state.builder.getMultiDimIdentityMap(memRefType.getRank())) {
1235
1236 for (auto op : mapOperands) {
1237 if (op.getDefiningOp())
1238 return nullptr;
1239 }
1241 indices);
1242 } else {
1243 indices.append(mapOperands.begin(), mapOperands.end());
1244 }
1245
1247 return nullptr;
1248
1249
1250 auto permutationMap = makePermutationMap(state.builder.getInsertionBlock(),
1251 indices, state.vecLoopToVecDim);
1252 if (!permutationMap) {
1253 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ can't compute permutationMap\n");
1254 return nullptr;
1255 }
1256 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
1257 LLVM_DEBUG(permutationMap.print(dbgs()));
1258
1259 auto transfer = state.builder.createvector::TransferReadOp(
1260 loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap);
1261
1262
1263 state.registerOpVectorReplacement(loadOp, transfer);
1264 return transfer;
1265 }
1266
1267
1268
1269
1270
1271
1272
1275 MemRefType memRefType = storeOp.getMemRefType();
1277 if (!vectorValue)
1278 return nullptr;
1279
1280
1282 state.getScalarValueReplacementsFor(storeOp.getMapOperands(), mapOperands);
1283
1284
1286 indices.reserve(memRefType.getRank());
1287 if (storeOp.getAffineMap() !=
1288 state.builder.getMultiDimIdentityMap(memRefType.getRank()))
1290 indices);
1291 else
1292 indices.append(mapOperands.begin(), mapOperands.end());
1293
1295 return nullptr;
1296
1297
1298 auto permutationMap = makePermutationMap(state.builder.getInsertionBlock(),
1299 indices, state.vecLoopToVecDim);
1300 if (!permutationMap)
1301 return nullptr;
1302 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
1303 LLVM_DEBUG(permutationMap.print(dbgs()));
1304
1305 auto transfer = state.builder.createvector::TransferWriteOp(
1306 storeOp.getLoc(), vectorValue, storeOp.getMemRef(), indices,
1307 permutationMap);
1308 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer);
1309
1310
1311 state.registerOpVectorReplacement(storeOp, transfer);
1312 return transfer;
1313 }
1314
1315
1316
1320 if (!VectorType::isValidElementType(scalarTy))
1321 return false;
1323 state.builder, value.getLoc());
1324 if (auto constOp = dyn_cast_or_nullarith::ConstantOp(value.getDefiningOp()))
1325 return constOp.getValue() == valueAttr;
1326 return false;
1327 }
1328
1329
1330
1331
1332
1333
1334
1338 auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);
1339 bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();
1340
1341
1342 if (isLoopVecDim && forOp.getNumIterOperands() > 0 && forOp.getStep() != 1) {
1343 LLVM_DEBUG(
1344 dbgs()
1345 << "\n[early-vect]+++++ unsupported step size for reduction loop: "
1346 << forOp.getStep() << "\n");
1347 return nullptr;
1348 }
1349
1350
1351
1352
1353 unsigned newStep;
1354 if (isLoopVecDim) {
1355 unsigned vectorDim = loopToVecDimIt->second;
1356 assert(vectorDim < strategy.vectorSizes.size() && "vector dim overflow");
1357 int64_t forOpVecFactor = strategy.vectorSizes[vectorDim];
1358 newStep = forOp.getStepAsInt() * forOpVecFactor;
1359 } else {
1360 newStep = forOp.getStepAsInt();
1361 }
1362
1363
1365 if (isLoopVecDim && forOp.getNumIterOperands() > 0) {
1368 "Reduction descriptors not found when vectorizing a reduction loop");
1369 reductions = it->second;
1370 assert(reductions.size() == forOp.getNumIterOperands() &&
1371 "The size of reductions array must match the number of iter_args");
1372 }
1373
1374
1376 if (!isLoopVecDim) {
1377 for (auto operand : forOp.getInits())
1378 vecIterOperands.push_back(vectorizeOperand(operand, state));
1379 } else {
1380
1381
1382
1383 for (auto redAndOperand : llvm::zip(reductions, forOp.getInits())) {
1385 std::get<0>(redAndOperand).kind, std::get<1>(redAndOperand), state));
1386 }
1387 }
1388
1389 auto vecForOp = state.builder.create(
1390 forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),
1391 forOp.getUpperBoundOperands(), forOp.getUpperBoundMap(), newStep,
1392 vecIterOperands,
1394
1395
1396 });
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411 state.registerOpVectorReplacement(forOp, vecForOp);
1412 state.registerValueScalarReplacement(forOp.getInductionVar(),
1413 vecForOp.getInductionVar());
1414 for (auto iterTuple :
1415 llvm ::zip(forOp.getRegionIterArgs(), vecForOp.getRegionIterArgs()))
1416 state.registerBlockArgVectorReplacement(std::get<0>(iterTuple),
1417 std::get<1>(iterTuple));
1418
1419 if (isLoopVecDim) {
1420 for (unsigned i = 0; i < vecForOp.getNumIterOperands(); ++i) {
1421
1422 Value reducedRes =
1424 vecForOp.getLoc(), vecForOp.getResult(i));
1425 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a vector reduction: "
1426 << reducedRes);
1427
1428
1429 Value origInit = forOp.getOperand(forOp.getNumControlOperands() + i);
1430 Value finalRes = reducedRes;
1432 finalRes =
1434 reducedRes.getLoc(), reducedRes, origInit);
1435 state.registerLoopResultScalarReplacement(forOp.getResult(i), finalRes);
1436 }
1437 }
1438
1439 if (isLoopVecDim)
1440 state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;
1441
1442
1443
1444 state.builder.setInsertionPointToStart(vecForOp.getBody());
1445
1446
1447
1448 if (isLoopVecDim && forOp.getNumIterOperands() > 0)
1450
1451 return vecForOp;
1452 }
1453
1454
1455
1456
1460 vectorTypes.push_back(
1461 VectorType::get(state.strategy->vectorSizes, result.getType()));
1462
1466 if (!vecOperand) {
1467 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize\n");
1468 return nullptr;
1469 }
1470 vectorOperands.push_back(vecOperand);
1471 }
1472
1473
1474
1475
1476
1477
1480 vectorOperands, vectorTypes, op->getAttrs());
1481 state.registerOpVectorReplacement(op, vecOp);
1482 return vecOp;
1483 }
1484
1485
1486
1487
1488
1492 Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503 if (Value mask = state.vecLoopToMask.lookup(newParentOp)) {
1504 state.builder.setInsertionPoint(newYieldOp);
1505 for (unsigned i = 0; i < newYieldOp->getNumOperands(); ++i) {
1508 cast(newParentOp).getRegionIterArgs(), i, combinerOps);
1509 assert(reducedVal && "expect non-null value for parallel reduction loop");
1510 assert(combinerOps.size() == 1 && "expect only one combiner op");
1511
1512 Value neutralVal = cast(newParentOp).getInits()[i];
1513 state.builder.setInsertionPoint(combinerOps.back());
1514 Value maskedReducedVal = state.builder.createarith::SelectOp(
1515 reducedVal.getLoc(), mask, reducedVal, neutralVal);
1516 LLVM_DEBUG(
1517 dbgs() << "\n[early-vect]+++++ masking an input to a binary op that"
1518 "produces value for a yield Op: "
1519 << maskedReducedVal);
1520 combinerOps.back()->replaceUsesOfWith(reducedVal, maskedReducedVal);
1521 }
1522 }
1523
1524 state.builder.setInsertionPointAfter(newParentOp);
1525 return newYieldOp;
1526 }
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1539
1540 assert(!isavector::TransferReadOp(op) &&
1541 "vector.transfer_read cannot be further vectorized");
1542 assert(!isavector::TransferWriteOp(op) &&
1543 "vector.transfer_write cannot be further vectorized");
1544
1545 if (auto loadOp = dyn_cast(op))
1547 if (auto storeOp = dyn_cast(op))
1549 if (auto forOp = dyn_cast(op))
1551 if (auto yieldOp = dyn_cast(op))
1553 if (auto constant = dyn_castarith::ConstantOp(op))
1555 if (auto applyOp = dyn_cast(op))
1557
1558
1560 return nullptr;
1561
1562 return widenOp(op, state);
1563 }
1564
1565
1566
1567
1568
1569 static void
1572
1573 assert(currentLevel <= loops.size() && "Unexpected currentLevel");
1574 if (currentLevel == loops.size())
1575 loops.emplace_back();
1576
1577
1578 loops[currentLevel].push_back(cast(match.getMatchedOperation()));
1581 }
1582 }
1583
1584
1585
1586
1587
1588
1589 static void
1593 }
1594
1595
1596
1597 static LogicalResult
1600 assert(loops[0].size() == 1 && "Expected single root loop");
1601 AffineForOp rootLoop = loops[0][0];
1603 state.builder.setInsertionPointAfter(rootLoop);
1604 state.strategy = &strategy;
1605
1606
1607
1608
1609
1610
1611
1612
1614 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
1615 return failure();
1616 }
1617
1618
1619
1620
1621
1622
1623
1624
1625
1627 LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);
1629 if (!vectorOp) {
1630 LLVM_DEBUG(
1631 dbgs() << "[early-vect]+++++ failed vectorizing the operation: "
1632 << *op << "\n");
1634 }
1635
1637 });
1638
1639 if (opVecResult.wasInterrupted()) {
1640 LLVM_DEBUG(dbgs() << "[early-vect]+++++ failed vectorization for: "
1641 << rootLoop << "\n");
1642
1643 auto vecRootLoopIt = state.opVectorReplacement.find(rootLoop);
1644 if (vecRootLoopIt != state.opVectorReplacement.end())
1645 eraseLoopNest(cast(vecRootLoopIt->second));
1646
1647 return failure();
1648 }
1649
1650
1651
1652 for (auto resPair : state.loopResultScalarReplacement)
1653 resPair.first.replaceAllUsesWith(resPair.second);
1654
1655 assert(state.opVectorReplacement.count(rootLoop) == 1 &&
1656 "Expected vector replacement for loop nest");
1657 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
1658 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorization result:\n"
1659 << *state.opVectorReplacement[rootLoop]);
1660
1661
1662 state.finishVectorizationPattern(rootLoop);
1663 return success();
1664 }
1665
1666
1667
1668
1671 std::vector<SmallVector<AffineForOp, 2>> loopsToVectorize;
1674 }
1675
1676
1677
1678
1679
1683 assert(intersectionBuckets.empty() && "Expected empty output");
1684
1686
1687 for (const NestedMatch &match : matches) {
1688 AffineForOp matchRoot = cast(match.getMatchedOperation());
1689 bool intersects = false;
1690 for (int i = 0, end = intersectionBuckets.size(); i < end; ++i) {
1691 AffineForOp bucketRoot = bucketRoots[i];
1692
1693 if (bucketRoot->isAncestor(matchRoot)) {
1694 intersectionBuckets[i].push_back(match);
1695 intersects = true;
1696 break;
1697 }
1698
1699
1700 if (matchRoot->isAncestor(bucketRoot)) {
1701 bucketRoots[i] = matchRoot;
1702 intersectionBuckets[i].push_back(match);
1703 intersects = true;
1704 break;
1705 }
1706 }
1707
1708
1709
1710 if (!intersects) {
1711 bucketRoots.push_back(matchRoot);
1712 intersectionBuckets.emplace_back();
1713 intersectionBuckets.back().push_back(match);
1714 }
1715 }
1716 }
1717
1718
1719
1720
1721
1722
1723
1728 assert((reductionLoops.empty() || vectorSizes.size() == 1) &&
1729 "Vectorizing reductions is supported only for 1-D vectors");
1730
1731
1732 std::optional pattern =
1733 makePattern(loops, vectorSizes.size(), fastestVaryingPattern);
1734 if (!pattern) {
1735 LLVM_DEBUG(dbgs() << "\n[early-vect] pattern couldn't be computed\n");
1736 return;
1737 }
1738
1739 LLVM_DEBUG(dbgs() << "\n******************************************");
1740 LLVM_DEBUG(dbgs() << "\n******************************************");
1741 LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on parent op\n");
1742 LLVM_DEBUG(dbgs() << *parentOp << "\n");
1743
1744 unsigned patternDepth = pattern->getDepth();
1745
1746
1747
1749 pattern->match(parentOp, &allMatches);
1750 std::vector<SmallVector<NestedMatch, 8>> intersectionBuckets;
1752
1753
1754
1755
1756 for (auto &intersectingMatches : intersectionBuckets) {
1757 for (NestedMatch &match : intersectingMatches) {
1759
1760 strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
1763 patternDepth, &strategy))) {
1764 continue;
1765 }
1767 &strategy);
1768
1769
1770
1771
1773 break;
1774 }
1775 }
1776
1777 LLVM_DEBUG(dbgs() << "\n");
1778 }
1779
1780
1781
1782 void Vectorize::runOnOperation() {
1783 func::FuncOp f = getOperation();
1784 if (!fastestVaryingPattern.empty() &&
1785 fastestVaryingPattern.size() != vectorSizes.size()) {
1786 f.emitRemark("Fastest varying pattern specified with different size than "
1787 "the vector size.");
1788 return signalPassFailure();
1789 }
1790
1791 if (vectorizeReductions && vectorSizes.size() != 1) {
1792 f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
1793 return signalPassFailure();
1794 }
1795
1796 if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
1797 f.emitError("Vectorization factor must be greater than zero.");
1798 return signalPassFailure();
1799 }
1800
1803
1804
1805
1806 if (vectorizeReductions) {
1807 f.walk([¶llelLoops, &reductionLoops](AffineForOp loop) {
1810 parallelLoops.insert(loop);
1811
1812 if (!reductions.empty())
1813 reductionLoops[loop] = reductions;
1814 }
1815 });
1816 } else {
1817 f.walk([¶llelLoops](AffineForOp loop) {
1819 parallelLoops.insert(loop);
1820 });
1821 }
1822
1823
1825 vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
1826 reductionLoops);
1827 }
1828
1829
1830
1831
1832
1833
1834
1835 static LogicalResult
1837
1838 if (loops.empty())
1839 return failure();
1840
1841
1842 if (loops[0].size() != 1)
1843 return failure();
1844
1845
1846 for (int i = 1, end = loops.size(); i < end; ++i) {
1847 for (AffineForOp loop : loops[i]) {
1848
1849
1850 if (none_of(loops[i - 1], [&](AffineForOp maybeParent) {
1851 return maybeParent->isProperAncestor(loop);
1852 }))
1853 return failure();
1854
1855
1856
1857 for (AffineForOp sibling : loops[i]) {
1858 if (sibling->isProperAncestor(loop))
1859 return failure();
1860 }
1861 }
1862 }
1863
1864 return success();
1865 }
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1880
1882 vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern,
1883 reductionLoops);
1884 }
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1924
1927 return failure();
1929 }
union mlir::linalg::@1203::ArityGroupAndKind::Kind kind
static Operation * vectorizeAffineStore(AffineStoreOp storeOp, VectorizationState &state)
Vectorizes an affine store with the vectorization strategy in 'state' by generating a 'vector....
static Operation * vectorizeAffineForOp(AffineForOp forOp, VectorizationState &state)
Vectorizes a loop with the vectorization strategy in 'state'.
static LogicalResult vectorizeRootMatch(NestedMatch m, const VectorizationStrategy &strategy)
Extracts the matched loops and vectorizes them following a topological order.
static LogicalResult verifyLoopNesting(const std::vector< SmallVector< AffineForOp, 2 >> &loops)
Verify that affine loops in 'loops' meet the nesting criteria expected by SuperVectorizer:
static void getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel, std::vector< SmallVector< AffineForOp, 2 >> &loops)
Recursive implementation to convert all the nested loops in 'match' to a 2D vector container that pre...
static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern, unsigned patternDepth, VectorizationStrategy *strategy)
static Operation * vectorizeOneOperation(Operation *op, VectorizationState &state)
Encodes Operation-specific behavior for vectorization.
static bool isNeutralElementConst(arith::AtomicRMWKind reductionKind, Value value, VectorizationState &state)
Returns true if value is a constant equal to the neutral element of the given vectorizable reduction.
static Operation * vectorizeUniform(Value uniformVal, VectorizationState &state)
Generates a broadcast op for the provided uniform value using the vectorization strategy in 'state'.
static Operation * vectorizeAffineYieldOp(AffineYieldOp yieldOp, VectorizationState &state)
Vectorizes a yield operation by widening its types.
static void computeIntersectionBuckets(ArrayRef< NestedMatch > matches, std::vector< SmallVector< NestedMatch, 8 >> &intersectionBuckets)
Traverses all the loop matches and classifies them into intersection buckets.
static LogicalResult analyzeProfitability(ArrayRef< NestedMatch > matches, unsigned depthInPattern, unsigned patternDepth, VectorizationStrategy *strategy)
Implements a simple strawman strategy for vectorization.
static FilterFunctionType isVectorizableLoopPtrFactory(const DenseSet< Operation * > ¶llelLoops, int fastestVaryingMemRefDimension)
Forward declaration.
static Operation * widenOp(Operation *op, VectorizationState &state)
Vectorizes arbitrary operation by plain widening.
static bool isIVMappedToMultipleIndices(ArrayRef< Value > indices, const DenseMap< Operation *, unsigned > &loopToVectorDim)
Returns true if any vectorized loop IV drives more than one index.
static arith::ConstantOp vectorizeConstant(arith::ConstantOp constOp, VectorizationState &state)
Tries to transform a scalar constant into a vector constant.
static bool isUniformDefinition(Value value, const VectorizationStrategy *strategy)
Returns true if the provided value is vector uniform given the vectorization strategy.
static void eraseLoopNest(AffineForOp forOp)
Erases a loop nest, including all its nested operations.
static VectorType getVectorType(Type scalarTy, const VectorizationStrategy *strategy)
Returns the vector type resulting from applying the provided vectorization strategy on the scalar typ...
static void getMatchedAffineLoops(NestedMatch match, std::vector< SmallVector< AffineForOp, 2 >> &loops)
Converts all the nested loops in 'match' to a 2D vector container that preserves the relative nesting...
static Value vectorizeOperand(Value operand, VectorizationState &state)
Tries to vectorize a given operand by applying the following logic:
static arith::ConstantOp createInitialVector(arith::AtomicRMWKind reductionKind, Value oldOperand, VectorizationState &state)
Creates a constant vector filled with the neutral elements of the given reduction.
static LogicalResult vectorizeLoopNest(std::vector< SmallVector< AffineForOp, 2 >> &loops, const VectorizationStrategy &strategy)
Internal implementation to vectorize affine loops from a single loop nest using an n-D vectorization ...
static NestedPattern & vectorTransferPattern()
static Operation * vectorizeAffineApplyOp(AffineApplyOp applyOp, VectorizationState &state)
We have no need to vectorize affine.apply.
static void vectorizeLoops(Operation *parentOp, DenseSet< Operation * > &loops, ArrayRef< int64_t > vectorSizes, ArrayRef< int64_t > fastestVaryingPattern, const ReductionLoopMap &reductionLoops)
Internal implementation to vectorize affine loops in 'loops' using the n-D vectorization factors in '...
static void computeMemoryOpIndices(Operation *op, AffineMap map, ValueRange mapOperands, VectorizationState &state, SmallVectorImpl< Value > &results)
static Operation * vectorizeAffineLoad(AffineLoadOp loadOp, VectorizationState &state)
Vectorizes an affine load with the vectorization strategy in 'state' by generating a 'vector....
static Value createMask(AffineForOp vecForOp, VectorizationState &state)
Creates a mask used to filter out garbage elements in the last iteration of unaligned loops.
static std::optional< NestedPattern > makePattern(const DenseSet< Operation * > &parallelLoops, int vectorRank, ArrayRef< int64_t > fastestVaryingPattern)
Creates a vectorization pattern from the command line arguments.
static AffineMap makePermutationMap(ArrayRef< Value > indices, const DenseMap< Operation *, unsigned > &enclosingLoopToVectorDim)
Constructs a permutation map from memref indices to vector dimension.
Base type for affine expression.
A multi-dimensional affine map. Affine maps are immutable like Type's, and they are uniqued.
static AffineMap get(MLIRContext *context)
Returns a zero result affine map with no dimensions or symbols: () -> ().
unsigned getNumSymbols() const
unsigned getNumDims() const
ArrayRef< AffineExpr > getResults() const
unsigned getNumResults() const
Attributes are known-constant values of operations.
This class represents an argument of a Block.
static DenseElementsAttr get(ShapedType type, ArrayRef< Attribute > values)
Constructs a dense elements attribute from an array of element values.
This is a utility class for mapping one set of IR entities to another.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext is the top-level object for a collection of MLIR operations.
RAII guard to reset the insertion point of the builder when destroyed.
This class helps build Operations.
StringAttr getIdentifier() const
Return the name of this operation as a StringAttr.
Operation is the basic unit of execution within MLIR.
OpResult getResult(unsigned idx)
Get the 'idx'th result of this operation.
unsigned getNumRegions()
Returns the number of regions held by this operation.
Location getLoc()
The source location the operation was defined or derived from.
unsigned getNumOperands()
Operation * getParentOp()
Returns the closest surrounding operation that contains this operation or nullptr if this is a top-le...
ArrayRef< NamedAttribute > getAttrs()
Return all of the attributes on this operation.
OperationName getName()
The name of an operation is the key identifier for it.
operand_range getOperands()
Returns an iterator on the underlying Value's.
result_range getResults()
void erase()
Remove this operation from its parent block and delete it.
unsigned getNumResults()
Return the number of results held by this operation.
Instances of the Type class are uniqued, have an immutable identifier and an optional mutable compone...
bool isIntOrIndexOrFloat() const
Return true if this is an integer (of any signedness), index, or float type.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
bool use_empty() const
Returns true if this value has no uses.
Type getType() const
Return the type of this value.
Location getLoc() const
Return the location of this value.
Operation * getDefiningOp() const
If this value is the result of an operation, return the operation that defines it.
static WalkResult advance()
static WalkResult interrupt()
An NestedPattern captures nested patterns in the IR.
Operation * getMatchedOperation() const
ArrayRef< NestedMatch > getMatchedChildren()
RAII structure to transparently manage the bump allocator for NestedPattern and NestedMatch classes.
NestedPattern For(const NestedPattern &child)
NestedPattern Op(FilterFunctionType filter=defaultFilterFunction)
bool isVectorizableLoopBody(AffineForOp loop, NestedPattern &vectorTransferMatcher)
Checks whether the loop is structurally vectorizable; i.e.
DenseSet< Value, DenseMapInfo< Value > > getInvariantAccesses(Value iv, ArrayRef< Value > indices)
Given an induction variable iv of type AffineForOp and indices of type IndexType, returns the set of ...
AffineForOp getForInductionVarOwner(Value val)
Returns the loop parent of an induction variable.
AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)
Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying th...
std::function< bool(Operation &)> FilterFunctionType
A NestedPattern is a nested operation walker that:
void vectorizeAffineLoops(Operation *parentOp, llvm::DenseSet< Operation *, DenseMapInfo< Operation * >> &loops, ArrayRef< int64_t > vectorSizes, ArrayRef< int64_t > fastestVaryingPattern, const ReductionLoopMap &reductionLoops=ReductionLoopMap())
Vectorizes affine loops in 'loops' using the n-D vectorization factors in 'vectorSizes'.
bool isLoopParallel(AffineForOp forOp, SmallVectorImpl< LoopReduction > *parallelReductions=nullptr)
Returns true if 'forOp' is a parallel loop.
LogicalResult vectorizeAffineLoopNest(std::vector< SmallVector< AffineForOp, 2 >> &loops, const VectorizationStrategy &strategy)
External utility to vectorize affine loops from a single loop nest using an n-D vectorization strategy.
TypedAttr getIdentityValueAttr(AtomicRMWKind kind, Type resultType, OpBuilder &builder, Location loc, bool useOnlyFiniteValue=false)
Returns the identity value attribute associated with an AtomicRMWKind op.
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)
Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRM...
Value getVectorReductionOp(arith::AtomicRMWKind op, OpBuilder &builder, Location loc, Value vector)
Returns the value obtained by reducing the vector into a scalar using the operation kind associated w...
Include the generated interface declarations.
Value matchReduction(ArrayRef< BlockArgument > iterCarriedArgs, unsigned redPos, SmallVectorImpl< Operation * > &combinerOps)
Utility to match a generic reduction given a list of iteration-carried arguments, iterCarriedArgs and...
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
Contains the vectorization state and related methods used across the vectorization process of a given...
Holds parameters to perform n-D vectorization on a single loop nest.
SmallVector< int64_t, 8 > vectorSizes
DenseMap< Operation *, unsigned > loopToVectorDim
ReductionLoopMap reductionLoops