MLIR: lib/Dialect/Affine/Transforms/SuperVectorize.cpp Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

13

15

29 #include "llvm/ADT/STLExtras.h"

30 #include "llvm/Support/Debug.h"

31 #include

32

33 namespace mlir {

34 namespace affine {

35 #define GEN_PASS_DEF_AFFINEVECTORIZE

36 #include "mlir/Dialect/Affine/Passes.h.inc"

37 }

38 }

39

40 using namespace mlir;

41 using namespace affine;

42 using namespace vector;

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575 #define DEBUG_TYPE "early-vect"

576

577 using llvm::dbgs;

578

579

582 int fastestVaryingMemRefDimension);

583

584

585

586

587

588 static std::optional

592 int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0];

593 int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1];

594 int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2];

595 switch (vectorRank) {

596 case 1:

598 case 2:

601 case 3:

605 default: {

606 return std::nullopt;

607 }

608 }

609 }

610

613 llvm::IsaPred<vector::TransferReadOp, vector::TransferWriteOp>);

614 return pattern;

615 }

616

617 namespace {

618

619

620

621 struct Vectorize : public affine::impl::AffineVectorizeBase {

622 using Base::Base;

623

624 void runOnOperation() override;

625 };

626

627 }

628

630 unsigned patternDepth,

632 assert(patternDepth > depthInPattern &&

633 "patternDepth is greater than depthInPattern");

634 if (patternDepth - depthInPattern > strategy->vectorSizes.size()) {

635

636 return;

637 }

639 strategy->vectorSizes.size() - (patternDepth - depthInPattern);

640 }

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

658 unsigned depthInPattern,

659 unsigned patternDepth,

661 for (auto m : matches) {

663 patternDepth, strategy))) {

664 return failure();

665 }

667 patternDepth, strategy);

668 }

669 return success();

670 }

671

672

673

674 namespace {

675

677

679

680

681

682

683

684

685

686

687

688

689 void registerOpVectorReplacement(Operation *replaced, Operation *replacement);

690

691

692

693

694

695

696

697

698

699

700

701

702 void registerValueVectorReplacement(Value replaced, Operation *replacement);

703

704

705

706

707

708

709 void registerBlockArgVectorReplacement(BlockArgument replaced,

711

712

713

714

715

716

717

718

719

720

721

722 void registerValueScalarReplacement(Value replaced, Value replacement);

723

724

725

726

727

728

729

730

731

732

733

734 void registerLoopResultScalarReplacement(Value replaced, Value replacement);

735

736

737

738 void getScalarValueReplacementsFor(ValueRange inputVals,

740

741

742 void finishVectorizationPattern(AffineForOp rootLoop);

743

744

745

747

748

750

751 IRMapping valueVectorReplacement;

752

753

754 IRMapping valueScalarReplacement;

755

757

758

760

761

762

764

765

767

768 private:

769

770

771 void registerValueVectorReplacementImpl(Value replaced, Value replacement);

772 };

773

774 }

775

776

777

778

779

780

781

782

783

784

785 void VectorizationState::registerOpVectorReplacement(Operation *replaced,

787 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op:\n");

788 LLVM_DEBUG(dbgs() << *replaced << "\n");

789 LLVM_DEBUG(dbgs() << "into\n");

790 LLVM_DEBUG(dbgs() << *replacement << "\n");

791

793 "Unexpected replaced and replacement results");

794 assert(opVectorReplacement.count(replaced) == 0 && "already registered");

795 opVectorReplacement[replaced] = replacement;

796

797 for (auto resultTuple :

799 registerValueVectorReplacementImpl(std::get<0>(resultTuple),

800 std::get<1>(resultTuple));

801 }

802

803

804

805

806

807

808

809

810

811

812

813 void VectorizationState::registerValueVectorReplacement(

816 "Expected single-result replacement");

818 registerOpVectorReplacement(defOp, replacement);

819 else

820 registerValueVectorReplacementImpl(replaced, replacement->getResult(0));

821 }

822

823

824

825

826

827

828 void VectorizationState::registerBlockArgVectorReplacement(

830 registerValueVectorReplacementImpl(replaced, replacement);

831 }

832

833 void VectorizationState::registerValueVectorReplacementImpl(Value replaced,

834 Value replacement) {

835 assert(!valueVectorReplacement.contains(replaced) &&

836 "Vector replacement already registered");

837 assert(isa(replacement.getType()) &&

838 "Expected vector type in vector replacement");

839 valueVectorReplacement.map(replaced, replacement);

840 }

841

842

843

844

845

846

847

848

849

850

851

852 void VectorizationState::registerValueScalarReplacement(Value replaced,

853 Value replacement) {

854 assert(!valueScalarReplacement.contains(replaced) &&

855 "Scalar value replacement already registered");

856 assert(!isa(replacement.getType()) &&

857 "Expected scalar type in scalar replacement");

858 valueScalarReplacement.map(replaced, replacement);

859 }

860

861

862

863

864

865

866

867

868

869

870 void VectorizationState::registerLoopResultScalarReplacement(

872 assert(isa(replaced.getDefiningOp()));

873 assert(loopResultScalarReplacement.count(replaced) == 0 &&

874 "already registered");

875 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ will replace a result of the loop "

876 "with scalar: "

877 << replacement);

878 loopResultScalarReplacement[replaced] = replacement;

879 }

880

881

882 void VectorizationState::getScalarValueReplacementsFor(

884 for (Value inputVal : inputVals)

885 replacedVals.push_back(valueScalarReplacement.lookupOrDefault(inputVal));

886 }

887

888

890 LLVM_DEBUG(dbgs() << "[early-vect]+++++ erasing:\n" << forOp << "\n");

891 forOp.erase();

892 }

893

894

895 void VectorizationState::finishVectorizationPattern(AffineForOp rootLoop) {

896 LLVM_DEBUG(dbgs() << "\n[early-vect] Finalizing vectorization\n");

898 }

899

900

905 for (auto resultExpr : map.getResults()) {

906 auto singleResMap =

908 auto afOp = state.builder.create(op->getLoc(), singleResMap,

909 mapOperands);

910 results.push_back(afOp);

911 }

912 }

913

914

915

916

919 int fastestVaryingMemRefDimension) {

920 return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {

921 auto loop = cast(forOp);

922 if (!parallelLoops.contains(loop))

923 return false;

924 int memRefDim = -1;

925 auto vectorizableBody =

927 if (!vectorizableBody)

928 return false;

929 return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||

930 memRefDim == fastestVaryingMemRefDimension;

931 };

932 }

933

934

935

938 assert(!isa(scalarTy) && "Expected scalar type");

940 }

941

942

943

944

947 Type scalarTy = constOp.getType();

948 if (!VectorType::isValidElementType(scalarTy))

949 return nullptr;

950

951 auto vecTy = getVectorType(scalarTy, state.strategy);

953

955 Operation *parentOp = state.builder.getInsertionBlock()->getParentOp();

956

957 while (parentOp && !state.vecLoopToVecDim.count(parentOp))

959 assert(parentOp && state.vecLoopToVecDim.count(parentOp) &&

960 isa(parentOp) && "Expected a vectorized for op");

961 auto vecForOp = cast(parentOp);

962 state.builder.setInsertionPointToStart(vecForOp.getBody());

963 auto newConstOp =

964 state.builder.createarith::ConstantOp(constOp.getLoc(), vecAttr);

965

966

967 state.registerOpVectorReplacement(constOp, newConstOp);

968 return newConstOp;

969 }

970

971

972

976 for (Value operand : applyOp.getOperands()) {

977 if (state.valueVectorReplacement.contains(operand)) {

978 LLVM_DEBUG(

979 dbgs() << "\n[early-vect]+++++ affine.apply on vector operand\n");

980 return nullptr;

981 } else {

982 Value updatedOperand = state.valueScalarReplacement.lookupOrNull(operand);

983 if (!updatedOperand)

984 updatedOperand = operand;

985 updatedOperands.push_back(updatedOperand);

986 }

987 }

988

989 auto newApplyOp = state.builder.create(

990 applyOp.getLoc(), applyOp.getAffineMap(), updatedOperands);

991

992

993 state.registerValueScalarReplacement(applyOp.getResult(),

994 newApplyOp.getResult());

995 return newApplyOp;

996 }

997

998

999

1000

1002 Value oldOperand,

1005 if (!VectorType::isValidElementType(scalarTy))

1006 return nullptr;

1007

1009 reductionKind, scalarTy, state.builder, oldOperand.getLoc());

1010 auto vecTy = getVectorType(scalarTy, state.strategy);

1012 auto newConstOp =

1013 state.builder.createarith::ConstantOp(oldOperand.getLoc(), vecAttr);

1014

1015 return newConstOp;

1016 }

1017

1018

1019

1020

1021

1022

1023

1024

1026 assert(state.strategy->vectorSizes.size() == 1 &&

1027 "Creating a mask non-1-D vectors is not supported.");

1028 assert(vecForOp.getStep() == state.strategy->vectorSizes[0] &&

1029 "Creating a mask for loops with non-unit original step size is not "

1030 "supported.");

1031

1032

1033 if (Value mask = state.vecLoopToMask.lookup(vecForOp))

1034 return mask;

1035

1036

1037

1038 if (vecForOp.hasConstantBounds()) {

1039 int64_t originalTripCount =

1040 vecForOp.getConstantUpperBound() - vecForOp.getConstantLowerBound();

1041 if (originalTripCount % vecForOp.getStepAsInt() == 0)

1042 return nullptr;

1043 }

1044

1046 state.builder.setInsertionPointToStart(vecForOp.getBody());

1047

1048

1049

1050

1051

1052

1053

1054

1055

1056

1057

1058 Location loc = vecForOp.getLoc();

1059

1060

1061

1062 AffineMap ubMap = vecForOp.getUpperBoundMap();

1065 ub = state.builder.create(loc, vecForOp.getUpperBoundMap(),

1066 vecForOp.getUpperBoundOperands());

1067 else

1068 ub = state.builder.create(loc, vecForOp.getUpperBoundMap(),

1069 vecForOp.getUpperBoundOperands());

1070

1072 state.builder.getAffineDimExpr(0) - state.builder.getAffineDimExpr(1);

1073 Value itersLeft =

1075 {ub, vecForOp.getInductionVar()});

1076

1079

1081 state.builder.getIntegerType(1));

1083 state.builder.createvector::CreateMaskOp(loc, maskTy, itersLeft);

1084

1085 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a mask:\n"

1086 << itersLeft << "\n"

1087 << mask << "\n");

1088

1089 state.vecLoopToMask[vecForOp] = mask;

1090 return mask;

1091 }

1092

1093

1094

1095

1096

1097

1101 if (forOp && strategy->loopToVectorDim.count(forOp) == 0)

1102 return true;

1103

1105 auto loop = cast(loopToDim.first);

1106 if (!loop.isDefinedOutsideOfLoop(value))

1107 return false;

1108 }

1109

1111 return false;

1112

1113 return true;

1114 }

1115

1116

1117

1121 Value uniformScalarRepl =

1122 state.valueScalarReplacement.lookupOrDefault(uniformVal);

1123 state.builder.setInsertionPointAfterValue(uniformScalarRepl);

1124

1126 auto bcastOp = state.builder.create(uniformVal.getLoc(),

1127 vectorTy, uniformScalarRepl);

1128 state.registerValueVectorReplacement(uniformVal, bcastOp);

1129 return bcastOp;

1130 }

1131

1132

1133

1134

1135

1136

1137

1138

1139

1140

1141

1142

1143

1144

1145

1146

1147

1148

1150 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorize operand: " << operand);

1151

1152 if (Value vecRepl = state.valueVectorReplacement.lookupOrNull(operand)) {

1153 LLVM_DEBUG(dbgs() << " -> already vectorized: " << vecRepl);

1154 return vecRepl;

1155 }

1156

1157

1158

1159

1160 assert(!isa(operand.getType()) &&

1161 "Vector op not found in replacement map");

1162

1163

1164 if (auto constOp = operand.getDefiningOparith::ConstantOp()) {

1166 LLVM_DEBUG(dbgs() << "-> constant: " << vecConstant);

1167 return vecConstant.getResult();

1168 }

1169

1170

1173 LLVM_DEBUG(dbgs() << "-> uniform: " << *vecUniform);

1174 return vecUniform->getResult(0);

1175 }

1176

1177

1178

1180 LLVM_DEBUG(dbgs() << "-> unsupported block argument\n");

1181 else

1182

1183 LLVM_DEBUG(dbgs() << "-> non-vectorizable\n");

1184

1185 return nullptr;

1186 }

1187

1188

1192 for (auto &kvp : loopToVectorDim) {

1193 AffineForOp forOp = cast(kvp.first);

1194

1197

1198 unsigned nonInvariant = 0;

1199 for (Value idx : indices) {

1200 if (invariants.count(idx))

1201 continue;

1202

1203 if (++nonInvariant > 1) {

1204 LLVM_DEBUG(dbgs() << "[early‑vect] Bail out: IV "

1205 << forOp.getInductionVar() << " drives "

1206 << nonInvariant << " indices\n");

1207 return true;

1208 }

1209 }

1210 }

1211 return false;

1212 }

1213

1214

1215

1216

1217

1218

1219

1222 MemRefType memRefType = loadOp.getMemRefType();

1223 Type elementType = memRefType.getElementType();

1224 auto vectorType = VectorType::get(state.strategy->vectorSizes, elementType);

1225

1226

1228 state.getScalarValueReplacementsFor(loadOp.getMapOperands(), mapOperands);

1229

1230

1232 indices.reserve(memRefType.getRank());

1233 if (loadOp.getAffineMap() !=

1234 state.builder.getMultiDimIdentityMap(memRefType.getRank())) {

1235

1236 for (auto op : mapOperands) {

1237 if (op.getDefiningOp())

1238 return nullptr;

1239 }

1241 indices);

1242 } else {

1243 indices.append(mapOperands.begin(), mapOperands.end());

1244 }

1245

1247 return nullptr;

1248

1249

1250 auto permutationMap = makePermutationMap(state.builder.getInsertionBlock(),

1251 indices, state.vecLoopToVecDim);

1252 if (!permutationMap) {

1253 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ can't compute permutationMap\n");

1254 return nullptr;

1255 }

1256 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");

1257 LLVM_DEBUG(permutationMap.print(dbgs()));

1258

1259 auto transfer = state.builder.createvector::TransferReadOp(

1260 loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices, permutationMap);

1261

1262

1263 state.registerOpVectorReplacement(loadOp, transfer);

1264 return transfer;

1265 }

1266

1267

1268

1269

1270

1271

1272

1275 MemRefType memRefType = storeOp.getMemRefType();

1277 if (!vectorValue)

1278 return nullptr;

1279

1280

1282 state.getScalarValueReplacementsFor(storeOp.getMapOperands(), mapOperands);

1283

1284

1286 indices.reserve(memRefType.getRank());

1287 if (storeOp.getAffineMap() !=

1288 state.builder.getMultiDimIdentityMap(memRefType.getRank()))

1290 indices);

1291 else

1292 indices.append(mapOperands.begin(), mapOperands.end());

1293

1295 return nullptr;

1296

1297

1298 auto permutationMap = makePermutationMap(state.builder.getInsertionBlock(),

1299 indices, state.vecLoopToVecDim);

1300 if (!permutationMap)

1301 return nullptr;

1302 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");

1303 LLVM_DEBUG(permutationMap.print(dbgs()));

1304

1305 auto transfer = state.builder.createvector::TransferWriteOp(

1306 storeOp.getLoc(), vectorValue, storeOp.getMemRef(), indices,

1307 permutationMap);

1308 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer);

1309

1310

1311 state.registerOpVectorReplacement(storeOp, transfer);

1312 return transfer;

1313 }

1314

1315

1316

1320 if (!VectorType::isValidElementType(scalarTy))

1321 return false;

1323 state.builder, value.getLoc());

1324 if (auto constOp = dyn_cast_or_nullarith::ConstantOp(value.getDefiningOp()))

1325 return constOp.getValue() == valueAttr;

1326 return false;

1327 }

1328

1329

1330

1331

1332

1333

1334

1338 auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);

1339 bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();

1340

1341

1342 if (isLoopVecDim && forOp.getNumIterOperands() > 0 && forOp.getStep() != 1) {

1343 LLVM_DEBUG(

1344 dbgs()

1345 << "\n[early-vect]+++++ unsupported step size for reduction loop: "

1346 << forOp.getStep() << "\n");

1347 return nullptr;

1348 }

1349

1350

1351

1352

1353 unsigned newStep;

1354 if (isLoopVecDim) {

1355 unsigned vectorDim = loopToVecDimIt->second;

1356 assert(vectorDim < strategy.vectorSizes.size() && "vector dim overflow");

1357 int64_t forOpVecFactor = strategy.vectorSizes[vectorDim];

1358 newStep = forOp.getStepAsInt() * forOpVecFactor;

1359 } else {

1360 newStep = forOp.getStepAsInt();

1361 }

1362

1363

1365 if (isLoopVecDim && forOp.getNumIterOperands() > 0) {

1368 "Reduction descriptors not found when vectorizing a reduction loop");

1369 reductions = it->second;

1370 assert(reductions.size() == forOp.getNumIterOperands() &&

1371 "The size of reductions array must match the number of iter_args");

1372 }

1373

1374

1376 if (!isLoopVecDim) {

1377 for (auto operand : forOp.getInits())

1378 vecIterOperands.push_back(vectorizeOperand(operand, state));

1379 } else {

1380

1381

1382

1383 for (auto redAndOperand : llvm::zip(reductions, forOp.getInits())) {

1385 std::get<0>(redAndOperand).kind, std::get<1>(redAndOperand), state));

1386 }

1387 }

1388

1389 auto vecForOp = state.builder.create(

1390 forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),

1391 forOp.getUpperBoundOperands(), forOp.getUpperBoundMap(), newStep,

1392 vecIterOperands,

1394

1395

1396 });

1397

1398

1399

1400

1401

1402

1403

1404

1405

1406

1407

1408

1409

1410

1411 state.registerOpVectorReplacement(forOp, vecForOp);

1412 state.registerValueScalarReplacement(forOp.getInductionVar(),

1413 vecForOp.getInductionVar());

1414 for (auto iterTuple :

1415 llvm ::zip(forOp.getRegionIterArgs(), vecForOp.getRegionIterArgs()))

1416 state.registerBlockArgVectorReplacement(std::get<0>(iterTuple),

1417 std::get<1>(iterTuple));

1418

1419 if (isLoopVecDim) {

1420 for (unsigned i = 0; i < vecForOp.getNumIterOperands(); ++i) {

1421

1422 Value reducedRes =

1424 vecForOp.getLoc(), vecForOp.getResult(i));

1425 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a vector reduction: "

1426 << reducedRes);

1427

1428

1429 Value origInit = forOp.getOperand(forOp.getNumControlOperands() + i);

1430 Value finalRes = reducedRes;

1432 finalRes =

1434 reducedRes.getLoc(), reducedRes, origInit);

1435 state.registerLoopResultScalarReplacement(forOp.getResult(i), finalRes);

1436 }

1437 }

1438

1439 if (isLoopVecDim)

1440 state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;

1441

1442

1443

1444 state.builder.setInsertionPointToStart(vecForOp.getBody());

1445

1446

1447

1448 if (isLoopVecDim && forOp.getNumIterOperands() > 0)

1450

1451 return vecForOp;

1452 }

1453

1454

1455

1456

1460 vectorTypes.push_back(

1461 VectorType::get(state.strategy->vectorSizes, result.getType()));

1462

1466 if (!vecOperand) {

1467 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize\n");

1468 return nullptr;

1469 }

1470 vectorOperands.push_back(vecOperand);

1471 }

1472

1473

1474

1475

1476

1477

1480 vectorOperands, vectorTypes, op->getAttrs());

1481 state.registerOpVectorReplacement(op, vecOp);

1482 return vecOp;

1483 }

1484

1485

1486

1487

1488

1492 Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();

1493

1494

1495

1496

1497

1498

1499

1500

1501

1502

1503 if (Value mask = state.vecLoopToMask.lookup(newParentOp)) {

1504 state.builder.setInsertionPoint(newYieldOp);

1505 for (unsigned i = 0; i < newYieldOp->getNumOperands(); ++i) {

1508 cast(newParentOp).getRegionIterArgs(), i, combinerOps);

1509 assert(reducedVal && "expect non-null value for parallel reduction loop");

1510 assert(combinerOps.size() == 1 && "expect only one combiner op");

1511

1512 Value neutralVal = cast(newParentOp).getInits()[i];

1513 state.builder.setInsertionPoint(combinerOps.back());

1514 Value maskedReducedVal = state.builder.createarith::SelectOp(

1515 reducedVal.getLoc(), mask, reducedVal, neutralVal);

1516 LLVM_DEBUG(

1517 dbgs() << "\n[early-vect]+++++ masking an input to a binary op that"

1518 "produces value for a yield Op: "

1519 << maskedReducedVal);

1520 combinerOps.back()->replaceUsesOfWith(reducedVal, maskedReducedVal);

1521 }

1522 }

1523

1524 state.builder.setInsertionPointAfter(newParentOp);

1525 return newYieldOp;

1526 }

1527

1528

1529

1530

1531

1532

1533

1534

1535

1536

1539

1540 assert(!isavector::TransferReadOp(op) &&

1541 "vector.transfer_read cannot be further vectorized");

1542 assert(!isavector::TransferWriteOp(op) &&

1543 "vector.transfer_write cannot be further vectorized");

1544

1545 if (auto loadOp = dyn_cast(op))

1547 if (auto storeOp = dyn_cast(op))

1549 if (auto forOp = dyn_cast(op))

1551 if (auto yieldOp = dyn_cast(op))

1553 if (auto constant = dyn_castarith::ConstantOp(op))

1555 if (auto applyOp = dyn_cast(op))

1557

1558

1560 return nullptr;

1561

1562 return widenOp(op, state);

1563 }

1564

1565

1566

1567

1568

1569 static void

1572

1573 assert(currentLevel <= loops.size() && "Unexpected currentLevel");

1574 if (currentLevel == loops.size())

1575 loops.emplace_back();

1576

1577

1578 loops[currentLevel].push_back(cast(match.getMatchedOperation()));

1581 }

1582 }

1583

1584

1585

1586

1587

1588

1589 static void

1593 }

1594

1595

1596

1597 static LogicalResult

1600 assert(loops[0].size() == 1 && "Expected single root loop");

1601 AffineForOp rootLoop = loops[0][0];

1603 state.builder.setInsertionPointAfter(rootLoop);

1604 state.strategy = &strategy;

1605

1606

1607

1608

1609

1610

1611

1612

1614 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");

1615 return failure();

1616 }

1617

1618

1619

1620

1621

1622

1623

1624

1625

1627 LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);

1629 if (!vectorOp) {

1630 LLVM_DEBUG(

1631 dbgs() << "[early-vect]+++++ failed vectorizing the operation: "

1632 << *op << "\n");

1634 }

1635

1637 });

1638

1639 if (opVecResult.wasInterrupted()) {

1640 LLVM_DEBUG(dbgs() << "[early-vect]+++++ failed vectorization for: "

1641 << rootLoop << "\n");

1642

1643 auto vecRootLoopIt = state.opVectorReplacement.find(rootLoop);

1644 if (vecRootLoopIt != state.opVectorReplacement.end())

1645 eraseLoopNest(cast(vecRootLoopIt->second));

1646

1647 return failure();

1648 }

1649

1650

1651

1652 for (auto resPair : state.loopResultScalarReplacement)

1653 resPair.first.replaceAllUsesWith(resPair.second);

1654

1655 assert(state.opVectorReplacement.count(rootLoop) == 1 &&

1656 "Expected vector replacement for loop nest");

1657 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");

1658 LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorization result:\n"

1659 << *state.opVectorReplacement[rootLoop]);

1660

1661

1662 state.finishVectorizationPattern(rootLoop);

1663 return success();

1664 }

1665

1666

1667

1668

1671 std::vector<SmallVector<AffineForOp, 2>> loopsToVectorize;

1674 }

1675

1676

1677

1678

1679

1683 assert(intersectionBuckets.empty() && "Expected empty output");

1684

1686

1687 for (const NestedMatch &match : matches) {

1688 AffineForOp matchRoot = cast(match.getMatchedOperation());

1689 bool intersects = false;

1690 for (int i = 0, end = intersectionBuckets.size(); i < end; ++i) {

1691 AffineForOp bucketRoot = bucketRoots[i];

1692

1693 if (bucketRoot->isAncestor(matchRoot)) {

1694 intersectionBuckets[i].push_back(match);

1695 intersects = true;

1696 break;

1697 }

1698

1699

1700 if (matchRoot->isAncestor(bucketRoot)) {

1701 bucketRoots[i] = matchRoot;

1702 intersectionBuckets[i].push_back(match);

1703 intersects = true;

1704 break;

1705 }

1706 }

1707

1708

1709

1710 if (!intersects) {

1711 bucketRoots.push_back(matchRoot);

1712 intersectionBuckets.emplace_back();

1713 intersectionBuckets.back().push_back(match);

1714 }

1715 }

1716 }

1717

1718

1719

1720

1721

1722

1723

1728 assert((reductionLoops.empty() || vectorSizes.size() == 1) &&

1729 "Vectorizing reductions is supported only for 1-D vectors");

1730

1731

1732 std::optional pattern =

1733 makePattern(loops, vectorSizes.size(), fastestVaryingPattern);

1734 if (!pattern) {

1735 LLVM_DEBUG(dbgs() << "\n[early-vect] pattern couldn't be computed\n");

1736 return;

1737 }

1738

1739 LLVM_DEBUG(dbgs() << "\n******************************************");

1740 LLVM_DEBUG(dbgs() << "\n******************************************");

1741 LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on parent op\n");

1742 LLVM_DEBUG(dbgs() << *parentOp << "\n");

1743

1744 unsigned patternDepth = pattern->getDepth();

1745

1746

1747

1749 pattern->match(parentOp, &allMatches);

1750 std::vector<SmallVector<NestedMatch, 8>> intersectionBuckets;

1752

1753

1754

1755

1756 for (auto &intersectingMatches : intersectionBuckets) {

1757 for (NestedMatch &match : intersectingMatches) {

1759

1760 strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());

1763 patternDepth, &strategy))) {

1764 continue;

1765 }

1767 &strategy);

1768

1769

1770

1771

1773 break;

1774 }

1775 }

1776

1777 LLVM_DEBUG(dbgs() << "\n");

1778 }

1779

1780

1781

1782 void Vectorize::runOnOperation() {

1783 func::FuncOp f = getOperation();

1784 if (!fastestVaryingPattern.empty() &&

1785 fastestVaryingPattern.size() != vectorSizes.size()) {

1786 f.emitRemark("Fastest varying pattern specified with different size than "

1787 "the vector size.");

1788 return signalPassFailure();

1789 }

1790

1791 if (vectorizeReductions && vectorSizes.size() != 1) {

1792 f.emitError("Vectorizing reductions is supported only for 1-D vectors.");

1793 return signalPassFailure();

1794 }

1795

1796 if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {

1797 f.emitError("Vectorization factor must be greater than zero.");

1798 return signalPassFailure();

1799 }

1800

1803

1804

1805

1806 if (vectorizeReductions) {

1807 f.walk([&parallelLoops, &reductionLoops](AffineForOp loop) {

1810 parallelLoops.insert(loop);

1811

1812 if (!reductions.empty())

1813 reductionLoops[loop] = reductions;

1814 }

1815 });

1816 } else {

1817 f.walk([&parallelLoops](AffineForOp loop) {

1819 parallelLoops.insert(loop);

1820 });

1821 }

1822

1823

1825 vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,

1826 reductionLoops);

1827 }

1828

1829

1830

1831

1832

1833

1834

1835 static LogicalResult

1837

1838 if (loops.empty())

1839 return failure();

1840

1841

1842 if (loops[0].size() != 1)

1843 return failure();

1844

1845

1846 for (int i = 1, end = loops.size(); i < end; ++i) {

1847 for (AffineForOp loop : loops[i]) {

1848

1849

1850 if (none_of(loops[i - 1], [&](AffineForOp maybeParent) {

1851 return maybeParent->isProperAncestor(loop);

1852 }))

1853 return failure();

1854

1855

1856

1857 for (AffineForOp sibling : loops[i]) {

1858 if (sibling->isProperAncestor(loop))

1859 return failure();

1860 }

1861 }

1862 }

1863

1864 return success();

1865 }

1866

1867

1868

1869

1870

1871

1872

1873

1874

1875

1880

1882 vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern,

1883 reductionLoops);

1884 }

1885

1886

1887

1888

1889

1890

1891

1892

1893

1894

1895

1896

1897

1898

1899

1900

1901

1902

1903

1904

1905

1906

1907

1908

1909

1910

1911

1912

1913

1914

1915

1916

1917

1918

1919

1920

1924

1927 return failure();

1929 }

union mlir::linalg::@1203::ArityGroupAndKind::Kind kind

static Operation * vectorizeAffineStore(AffineStoreOp storeOp, VectorizationState &state)

Vectorizes an affine store with the vectorization strategy in 'state' by generating a 'vector....

static Operation * vectorizeAffineForOp(AffineForOp forOp, VectorizationState &state)

Vectorizes a loop with the vectorization strategy in 'state'.

static LogicalResult vectorizeRootMatch(NestedMatch m, const VectorizationStrategy &strategy)

Extracts the matched loops and vectorizes them following a topological order.

static LogicalResult verifyLoopNesting(const std::vector< SmallVector< AffineForOp, 2 >> &loops)

Verify that affine loops in 'loops' meet the nesting criteria expected by SuperVectorizer:

static void getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel, std::vector< SmallVector< AffineForOp, 2 >> &loops)

Recursive implementation to convert all the nested loops in 'match' to a 2D vector container that pre...

static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern, unsigned patternDepth, VectorizationStrategy *strategy)

static Operation * vectorizeOneOperation(Operation *op, VectorizationState &state)

Encodes Operation-specific behavior for vectorization.

static bool isNeutralElementConst(arith::AtomicRMWKind reductionKind, Value value, VectorizationState &state)

Returns true if value is a constant equal to the neutral element of the given vectorizable reduction.

static Operation * vectorizeUniform(Value uniformVal, VectorizationState &state)

Generates a broadcast op for the provided uniform value using the vectorization strategy in 'state'.

static Operation * vectorizeAffineYieldOp(AffineYieldOp yieldOp, VectorizationState &state)

Vectorizes a yield operation by widening its types.

static void computeIntersectionBuckets(ArrayRef< NestedMatch > matches, std::vector< SmallVector< NestedMatch, 8 >> &intersectionBuckets)

Traverses all the loop matches and classifies them into intersection buckets.

static LogicalResult analyzeProfitability(ArrayRef< NestedMatch > matches, unsigned depthInPattern, unsigned patternDepth, VectorizationStrategy *strategy)

Implements a simple strawman strategy for vectorization.

static FilterFunctionType isVectorizableLoopPtrFactory(const DenseSet< Operation * > &parallelLoops, int fastestVaryingMemRefDimension)

Forward declaration.

static Operation * widenOp(Operation *op, VectorizationState &state)

Vectorizes arbitrary operation by plain widening.

static bool isIVMappedToMultipleIndices(ArrayRef< Value > indices, const DenseMap< Operation *, unsigned > &loopToVectorDim)

Returns true if any vectorized loop IV drives more than one index.

static arith::ConstantOp vectorizeConstant(arith::ConstantOp constOp, VectorizationState &state)

Tries to transform a scalar constant into a vector constant.

static bool isUniformDefinition(Value value, const VectorizationStrategy *strategy)

Returns true if the provided value is vector uniform given the vectorization strategy.

static void eraseLoopNest(AffineForOp forOp)

Erases a loop nest, including all its nested operations.

static VectorType getVectorType(Type scalarTy, const VectorizationStrategy *strategy)

Returns the vector type resulting from applying the provided vectorization strategy on the scalar type.

static void getMatchedAffineLoops(NestedMatch match, std::vector< SmallVector< AffineForOp, 2 >> &loops)

Converts all the nested loops in 'match' to a 2D vector container that preserves the relative nesting level of each loop.

static Value vectorizeOperand(Value operand, VectorizationState &state)

Tries to vectorize a given operand by applying the following logic:

static arith::ConstantOp createInitialVector(arith::AtomicRMWKind reductionKind, Value oldOperand, VectorizationState &state)

Creates a constant vector filled with the neutral elements of the given reduction.

static LogicalResult vectorizeLoopNest(std::vector< SmallVector< AffineForOp, 2 >> &loops, const VectorizationStrategy &strategy)

Internal implementation to vectorize affine loops from a single loop nest using an n-D vectorization strategy.

static NestedPattern & vectorTransferPattern()

static Operation * vectorizeAffineApplyOp(AffineApplyOp applyOp, VectorizationState &state)

We have no need to vectorize affine.apply.

static void vectorizeLoops(Operation *parentOp, DenseSet< Operation * > &loops, ArrayRef< int64_t > vectorSizes, ArrayRef< int64_t > fastestVaryingPattern, const ReductionLoopMap &reductionLoops)

Internal implementation to vectorize affine loops in 'loops' using the n-D vectorization factors in 'vectorSizes'.

static void computeMemoryOpIndices(Operation *op, AffineMap map, ValueRange mapOperands, VectorizationState &state, SmallVectorImpl< Value > &results)

static Operation * vectorizeAffineLoad(AffineLoadOp loadOp, VectorizationState &state)

Vectorizes an affine load with the vectorization strategy in 'state' by generating a 'vector.transfer_read' op with a permutation map inferred from the indices of the load.

static Value createMask(AffineForOp vecForOp, VectorizationState &state)

Creates a mask used to filter out garbage elements in the last iteration of unaligned loops.

static std::optional< NestedPattern > makePattern(const DenseSet< Operation * > &parallelLoops, int vectorRank, ArrayRef< int64_t > fastestVaryingPattern)

Creates a vectorization pattern from the command line arguments.

static AffineMap makePermutationMap(ArrayRef< Value > indices, const DenseMap< Operation *, unsigned > &enclosingLoopToVectorDim)

Constructs a permutation map from memref indices to vector dimension.

Base type for affine expression.

A multi-dimensional affine map. Affine maps are immutable like Types, and they are uniqued.

static AffineMap get(MLIRContext *context)

Returns a zero result affine map with no dimensions or symbols: () -> ().

unsigned getNumSymbols() const

unsigned getNumDims() const

ArrayRef< AffineExpr > getResults() const

unsigned getNumResults() const

Attributes are known-constant values of operations.

This class represents an argument of a Block.

static DenseElementsAttr get(ShapedType type, ArrayRef< Attribute > values)

Constructs a dense elements attribute from an array of element values.

This is a utility class for mapping one set of IR entities to another.

This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.

MLIRContext is the top-level object for a collection of MLIR operations.

RAII guard to reset the insertion point of the builder when destroyed.

This class helps build Operations.

StringAttr getIdentifier() const

Return the name of this operation as a StringAttr.

Operation is the basic unit of execution within MLIR.

OpResult getResult(unsigned idx)

Get the 'idx'th result of this operation.

unsigned getNumRegions()

Returns the number of regions held by this operation.

Location getLoc()

The source location the operation was defined or derived from.

unsigned getNumOperands()

Operation * getParentOp()

Returns the closest surrounding operation that contains this operation, or nullptr if this is a top-level operation.

ArrayRef< NamedAttribute > getAttrs()

Return all of the attributes on this operation.

OperationName getName()

The name of an operation is the key identifier for it.

operand_range getOperands()

Returns an iterator on the underlying Value's.

result_range getResults()

void erase()

Remove this operation from its parent block and delete it.

unsigned getNumResults()

Return the number of results held by this operation.

Instances of the Type class are uniqued, have an immutable identifier and an optional mutable component.

bool isIntOrIndexOrFloat() const

Return true if this is an integer (of any signedness), index, or float type.

This class provides an abstraction over the different types of ranges over Values.

This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.

bool use_empty() const

Returns true if this value has no uses.

Type getType() const

Return the type of this value.

Location getLoc() const

Return the location of this value.

Operation * getDefiningOp() const

If this value is the result of an operation, return the operation that defines it.

static WalkResult advance()

static WalkResult interrupt()

An NestedPattern captures nested patterns in the IR.

Operation * getMatchedOperation() const

ArrayRef< NestedMatch > getMatchedChildren()

RAII structure to transparently manage the bump allocator for NestedPattern and NestedMatch classes.

NestedPattern For(const NestedPattern &child)

NestedPattern Op(FilterFunctionType filter=defaultFilterFunction)

bool isVectorizableLoopBody(AffineForOp loop, NestedPattern &vectorTransferMatcher)

Checks whether the loop is structurally vectorizable; i.e.

DenseSet< Value, DenseMapInfo< Value > > getInvariantAccesses(Value iv, ArrayRef< Value > indices)

Given an induction variable iv of type AffineForOp and indices of type IndexType, returns the set of indices that are independent of iv.

AffineForOp getForInductionVarOwner(Value val)

Returns the loop parent of an induction variable.

AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)

Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying those operands.

std::function< bool(Operation &)> FilterFunctionType

A NestedPattern is a nested operation walker that:

void vectorizeAffineLoops(Operation *parentOp, llvm::DenseSet< Operation *, DenseMapInfo< Operation * >> &loops, ArrayRef< int64_t > vectorSizes, ArrayRef< int64_t > fastestVaryingPattern, const ReductionLoopMap &reductionLoops=ReductionLoopMap())

Vectorizes affine loops in 'loops' using the n-D vectorization factors in 'vectorSizes'.

bool isLoopParallel(AffineForOp forOp, SmallVectorImpl< LoopReduction > *parallelReductions=nullptr)

Returns true if 'forOp' is a parallel loop.

LogicalResult vectorizeAffineLoopNest(std::vector< SmallVector< AffineForOp, 2 >> &loops, const VectorizationStrategy &strategy)

External utility to vectorize affine loops from a single loop nest using an n-D vectorization strategy.

TypedAttr getIdentityValueAttr(AtomicRMWKind kind, Type resultType, OpBuilder &builder, Location loc, bool useOnlyFiniteValue=false)

Returns the identity value attribute associated with an AtomicRMWKind op.

Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc, Value lhs, Value rhs)

Returns the value obtained by applying the reduction operation kind associated with a binary AtomicRMWKind op to lhs and rhs.

Value getVectorReductionOp(arith::AtomicRMWKind op, OpBuilder &builder, Location loc, Value vector)

Returns the value obtained by reducing the vector into a scalar using the operation kind associated with op.

Include the generated interface declarations.

Value matchReduction(ArrayRef< BlockArgument > iterCarriedArgs, unsigned redPos, SmallVectorImpl< Operation * > &combinerOps)

Utility to match a generic reduction given a list of iteration-carried arguments, iterCarriedArgs, and the position of the potential reduction argument within the list, redPos.

auto get(MLIRContext *context, Ts &&...params)

Helper method that injects context only if needed; this helps unify some of the attribute construction methods.

Contains the vectorization state and related methods used across the vectorization process of a given...

Holds parameters to perform n-D vectorization on a single loop nest.

SmallVector< int64_t, 8 > vectorSizes

DenseMap< Operation *, unsigned > loopToVectorDim

ReductionLoopMap reductionLoops