Adding IR2Vec as an analysis pass by svkeerthy · Pull Request #134004 · llvm/llvm-project (original) (raw)
@llvm/pr-subscribers-llvm-analysis
Author: S. VenkataKeerthy (svkeerthy)
Changes
This PR introduces IR2Vec as an analysis pass. The changes include:
- Logic for generating Symbolic encodings.
- 75D learned vocabulary.
- lit tests.
(I am planning to post an RFC and will update this PR with a link to it.)
Acknowledgements: contributors - https://github.com/IITH-Compilers/IR2Vec/graphs/contributors
Patch is 88.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134004.diff
10 Files Affected:
- (added) llvm/include/llvm/Analysis/IR2VecAnalysis.h (+131)
- (modified) llvm/lib/Analysis/CMakeLists.txt (+1)
- (added) llvm/lib/Analysis/IR2VecAnalysis.cpp (+429)
- (added) llvm/lib/Analysis/models/seedEmbeddingVocab75D.json (+65)
- (modified) llvm/lib/Passes/PassBuilder.cpp (+1)
- (modified) llvm/lib/Passes/PassRegistry.def (+3)
- (added) llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_vocab.json (+7)
- (added) llvm/test/Analysis/IR2Vec/Inputs/dummy_5D_vocab.json (+11)
- (added) llvm/test/Analysis/IR2Vec/basic.ll (+50)
- (added) llvm/test/Analysis/IR2Vec/if-else.ll (+38)
diff --git a/llvm/include/llvm/Analysis/IR2VecAnalysis.h b/llvm/include/llvm/Analysis/IR2VecAnalysis.h new file mode 100644 index 0000000000000..dd5c00a1168b8 --- /dev/null +++ b/llvm/include/llvm/Analysis/IR2VecAnalysis.h @@ -0,0 +1,131 @@ +//===- IR2VecAnalysis.h - IR2Vec Analysis Implementation -------- C++ --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. See the LICENSE file for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of IR2VecAnalysis that computes +/// IR2Vec Embeddings of the program. +/// +/// Program Embeddings are typically or derived-from a learned +/// representation of the program. Such embeddings are used to represent the +/// programs as input to machine learning algorithms. IR2Vec represents the +/// LLVM IR as embeddings. +/// +/// The IR2Vec algorithm is described in the following paper: +/// +/// IR2Vec: LLVM IR Based Scalable Program Embeddings, S. VenkataKeerthy, +/// Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar, Ramakrishna +/// Upadrasta, and Y. N. Srikant, ACM Transactions on Architecture and +/// Code Optimization (TACO), 2020. https://doi.org/10.1145/3418463. +/// https://arxiv.org/abs/1909.06228 +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_IR2VECANALYSIS_H +#define LLVM_ANALYSIS_IR2VECANALYSIS_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/IR/PassManager.h" +#include + +namespace llvm { + +class Module; +class BasicBlock; +class Instruction; +class Function; + +namespace ir2vec { +using Embedding = std::vector; +// FIXME: Current the keys are strings. This can be changed to +// use integers for cheaper lookups. 
+using Vocab = std::map<std::string, Embedding>; +} // namespace ir2vec + +class IR2VecVocabResult; +class IR2VecResult; + +/// This analysis provides the vocabulary for IR2Vec. The vocabulary provides a +/// mapping between an entity of the IR (like opcode, type, argument, etc.) and +/// its corresponding embedding. +class IR2VecVocabAnalysis : public AnalysisInfoMixin {
- unsigned DIM = 0;
- ir2vec::Vocab Vocabulary;
- Error readVocabulary();
- +public:
- static AnalysisKey Key;
- IR2VecVocabAnalysis() = default;
- using Result = IR2VecVocabResult;
- Result run(Module &M, ModuleAnalysisManager &MAM); +};
- +class IR2VecVocabResult {
- ir2vec::Vocab Vocabulary;
- bool Valid = false;
- unsigned DIM = 0;
- +public:
- IR2VecVocabResult() = default;
- IR2VecVocabResult(ir2vec::Vocab &&Vocabulary, unsigned Dim);
- // Helper functions
- bool isValid() const { return Valid; }
- const ir2vec::Vocab &getVocabulary() const;
- unsigned getDimension() const { return DIM; }
- bool invalidate(Module &M, const PreservedAnalyses &PA,
ModuleAnalysisManager::Invalidator &Inv);
+}; + +class IR2VecResult {
- SmallMapVector<const Instruction *, ir2vec::Embedding, 128> InstVecMap;
- SmallMapVector<const BasicBlock *, ir2vec::Embedding, 16> BBVecMap;
- ir2vec::Embedding FuncVector;
- unsigned DIM = 0;
- bool Valid = false;
- +public:
- IR2VecResult() = default;
- IR2VecResult(
SmallMapVector<const Instruction *, ir2vec::Embedding, 128> &&InstMap,
SmallMapVector<const BasicBlock *, ir2vec::Embedding, 16> &&BBMap,
ir2vec::Embedding &&FuncVector, unsigned Dim);
- bool isValid() const { return Valid; }
- const SmallMapVector<const Instruction *, ir2vec::Embedding, 128> &
- getInstVecMap() const;
- const SmallMapVector<const BasicBlock *, ir2vec::Embedding, 16> &
- getBBVecMap() const;
- const ir2vec::Embedding &getFunctionVector() const;
- unsigned getDimension() const; +};
- +/// This analysis provides the IR2Vec embeddings for instructions, basic blocks, +/// and functions. +class IR2VecAnalysis : public AnalysisInfoMixin { +public:
- IR2VecAnalysis() = default;
- static AnalysisKey Key;
- using Result = IR2VecResult;
- Result run(Function &F, FunctionAnalysisManager &FAM); +};
- +/// This pass prints the IR2Vec embeddings for instructions, basic blocks, and +/// functions. +class IR2VecPrinterPass : public PassInfoMixin {
- raw_ostream &OS;
- void printVector(const ir2vec::Embedding &Vec) const;
- +public:
- explicit IR2VecPrinterPass(raw_ostream &OS) : OS(OS) {}
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
- static bool isRequired() { return true; } +};
- +} // namespace llvm
- +#endif // LLVM_ANALYSIS_IR2VECANALYSIS_H
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index fbf3b587d6bd2..8a6399f756f27 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -67,6 +67,7 @@ add_llvm_component_library(LLVMAnalysis GlobalsModRef.cpp GuardUtils.cpp HeatUtils.cpp
- IR2VecAnalysis.cpp IRSimilarityIdentifier.cpp IVDescriptors.cpp IVUsers.cpp diff --git a/llvm/lib/Analysis/IR2VecAnalysis.cpp b/llvm/lib/Analysis/IR2VecAnalysis.cpp new file mode 100644 index 0000000000000..1ff233145769f --- /dev/null +++ b/llvm/lib/Analysis/IR2VecAnalysis.cpp @@ -0,0 +1,429 @@ +//===- IR2VecAnalysis.cpp - IR2Vec Analysis Implementation ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. See the LICENSE file for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the IR2Vec algorithm. +/// +//===----------------------------------------------------------------------===//
- +#include "llvm/Analysis/IR2VecAnalysis.h"
- +#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; +using namespace ir2vec; + +#define DEBUG_TYPE "ir2vec" + +STATISTIC(DataMissCounter, "Number of data misses in the vocabulary"); + +/// IR2Vec computes two kinds of embeddings: Symbolic and Flow-aware. +/// Symbolic embeddings capture the "syntactic" and "statistical correlation" +/// of the IR entities. Flow-aware embeddings build on top of symbolic +/// embeddings and additionally capture the flow information in the IR. +/// IR2VecKind is used to specify the type of embeddings to generate. +// FIXME: Currently we support only Symbolic. Add support for +// Flow-aware in upcoming patches. +enum class IR2VecKind { Symbolic, Flowaware }; + +static cl::OptionCategory IR2VecAnalysisCategory("IR2Vec Analysis Options"); + +cl::opt
- IR2VecMode("ir2vec-mode",
cl::desc("Choose type of embeddings to generate:"),
cl::values(clEnumValN(IR2VecKind::Symbolic, "symbolic",
"Generates symbolic embeddings"),
clEnumValN(IR2VecKind::Flowaware, "flowaware",
"Generates flow-aware embeddings")),
cl::init(IR2VecKind::Symbolic), cl::cat(IR2VecAnalysisCategory));
- +// FIXME: Use a default vocab when not specified +static cl::optstd::string
- VocabFile("ir2vec-vocab-path", cl::Optional,
cl::desc("Path to the vocabulary file for IR2Vec"), cl::init(""),
cl::cat(IR2VecAnalysisCategory));
- +AnalysisKey IR2VecVocabAnalysis::Key; +AnalysisKey IR2VecAnalysis::Key;
//===----------------------------------------------------------------------===//
// Embeddings and its subclasses
//===----------------------------------------------------------------------===//
- +namespace { +/// Embeddings provides the interface to generate vector representations for +/// instructions, basic blocks, and functions. The vector +/// representations are generated using IR2Vec algorithms. +/// +/// The Embeddings class is an abstract class and it is intended to be +/// subclassed for different IR2Vec algorithms like Symbolic and Flow-aware. +class Embeddings { +protected:
- const Function &F;
- Vocab Vocabulary;
- /// Weights for different entities (like opcode, arguments, types)
- /// in the IR instructions to generate the vector representation.
- // FIXME: Defaults to the values used in the original algorithm. Can be
- // parameterized later.
- float WO = 1.0, WT = 0.5, WA = 0.2;
- /// Dimension of the vector representation; captured from the input vocabulary
- unsigned DIM = 300;
- // Utility maps - these are used to store the vector representations of
- // instructions, basic blocks and functions.
- Embedding FuncVector;
- SmallMapVector<const BasicBlock *, Embedding, 16> BBVecMap;
- SmallMapVector<const Instruction *, Embedding, 128> InstVecMap;
- Embeddings(const Function &F, const Vocab &Vocabulary, unsigned DIM)
: F(F), Vocabulary(Vocabulary), DIM(DIM) {}
- /// Lookup vocabulary for a given Key. If the key is not found, it returns a
- /// zero vector.
- Embedding lookupVocab(const std::string &Key);
- +public:
- virtual ~Embeddings() = default;
- /// Top level function to compute embeddings. Given a function, it
- /// generates embeddings for all the instructions and basic blocks in that
- /// function. Logic of computing the embeddings is specific to the kind of
- /// embeddings being computed.
- virtual void computeEmbeddings() = 0;
- /// Returns a map containing instructions and the corresponding vector
- /// representations for a given module corresponding to the IR2Vec
- /// algorithm.
- const SmallMapVector<const Instruction *, Embedding, 128> &
- getInstVecMap() const {
- return InstVecMap;
- }
- /// Returns a map containing basic block and the corresponding vector
- /// representations for a given module corresponding to the IR2Vec
- /// algorithm.
- const SmallMapVector<const BasicBlock *, Embedding, 16> &getBBVecMap() const {
- return BBVecMap;
- }
- /// Returns the vector representation for a given function corresponding to
- /// the IR2Vec algorithm.
- const Embedding &getFunctionVector() const { return FuncVector; } +};
- +/// Class for computing the Symbolic embeddings of IR2Vec +class Symbolic : public Embeddings { +private:
- /// Utility function to compute the vector representation for a given basic
- /// block.
- Embedding computeBB2Vec(const BasicBlock &BB);
- /// Utility function to compute the vector representation for a given
- /// function.
- Embedding computeFunc2Vec();
- +public:
- Symbolic(const Function &F, const Vocab &Vocabulary, unsigned DIM)
: Embeddings(F, Vocabulary, DIM) {
- FuncVector = Embedding(DIM, 0);
- }
- void computeEmbeddings() override; +};
- +/// Scales the vector Vec by Factor +void scaleVector(Embedding &Vec, const float Factor) {
- std::transform(Vec.begin(), Vec.end(), Vec.begin(),
[Factor](double X) { return X * Factor; });
+} + +/// Adds two vectors: Vec += Vec2 +void addVectors(Embedding &Vec, const Embedding &Vec2) {
- std::transform(Vec.begin(), Vec.end(), Vec2.begin(), Vec.begin(),
std::plus<double>());
+} + +// FIXME: Currently lookups are string based. Use numeric Keys +// for efficiency. +Embedding Embeddings::lookupVocab(const std::string &Key) {
- Embedding Vec(DIM, 0);
- // FIXME: Use zero vectors in vocab and assert failure for
- // unknown entities rather than silently returning zeroes here.
- if (Vocabulary.find(Key) == Vocabulary.end()) {
- LLVM_DEBUG(errs() << "cannot find key in map : " << Key << "\n");
- DataMissCounter++;
- } else {
- Vec = Vocabulary[Key];
- }
- return Vec; +}
- +void Symbolic::computeEmbeddings() {
- if (F.isDeclaration())
- return;
- for (auto &BB : F) {
- auto It = BBVecMap.find(&BB);
- if (It != BBVecMap.end())
continue;
- BBVecMap[&BB] = computeBB2Vec(BB);
- addVectors(FuncVector, BBVecMap[&BB]);
- } +}
- +Embedding Symbolic::computeBB2Vec(const BasicBlock &BB) {
- Embedding BBVector(DIM, 0);
- for (auto &I : BB) {
- Embedding InstVector(DIM, 0);
- auto Vec = lookupVocab(I.getOpcodeName());
- scaleVector(Vec, WO);
- addVectors(InstVector, Vec);
- auto Type = I.getType();
- if (Type->isVoidTy()) {
Vec = lookupVocab("voidTy");
- } else if (Type->isFloatingPointTy()) {
Vec = lookupVocab("floatTy");
- } else if (Type->isIntegerTy()) {
Vec = lookupVocab("integerTy");
- } else if (Type->isFunctionTy()) {
Vec = lookupVocab("functionTy");
- } else if (Type->isStructTy()) {
Vec = lookupVocab("structTy");
- } else if (Type->isArrayTy()) {
Vec = lookupVocab("arrayTy");
- } else if (Type->isPointerTy()) {
Vec = lookupVocab("pointerTy");
- } else if (Type->isVectorTy()) {
Vec = lookupVocab("vectorTy");
- } else if (Type->isEmptyTy()) {
Vec = lookupVocab("emptyTy");
- } else if (Type->isLabelTy()) {
Vec = lookupVocab("labelTy");
- } else if (Type->isTokenTy()) {
Vec = lookupVocab("tokenTy");
- } else if (Type->isMetadataTy()) {
Vec = lookupVocab("metadataTy");
- } else {
Vec = lookupVocab("unknownTy");
- }
- scaleVector(Vec, WT);
- addVectors(InstVector, Vec);
- for (auto &Op : I.operands()) {
Embedding Vec;
if (isa<Function>(Op)) {
Vec = lookupVocab("function");
} else if (isa<PointerType>(Op->getType())) {
Vec = lookupVocab("pointer");
} else if (isa<Constant>(Op)) {
Vec = lookupVocab("constant");
} else {
Vec = lookupVocab("variable");
}
scaleVector(Vec, WA);
addVectors(InstVector, Vec);
- }
- InstVecMap[&I] = InstVector;
- addVectors(BBVector, InstVector);
- }
- return BBVector; +} +} // namespace
//===----------------------------------------------------------------------===//
// IR2VecVocabResult and IR2VecVocabAnalysis
//===----------------------------------------------------------------------===//
- +IR2VecVocabResult::IR2VecVocabResult(ir2vec::Vocab &&Vocabulary, unsigned Dim)
- : Vocabulary(std::move(Vocabulary)), Valid(true), DIM(Dim) {}
- +const ir2vec::Vocab &IR2VecVocabResult::getVocabulary() const {
- assert(Valid);
- return Vocabulary; +}
- +// For now, assume vocabulary is stable unless explicitly invalidated. +bool IR2VecVocabResult::invalidate(Module &M, const PreservedAnalyses &PA,
ModuleAnalysisManager::Invalidator &Inv) {
- auto PAC = PA.getChecker();
- return !(PAC.preservedWhenStateless()); +}
- +// FIXME: Make this optional. We can avoid file reads +// by auto-generating the vocabulary during the build time. +Error IR2VecVocabAnalysis::readVocabulary() {
- auto BufOrError = MemoryBuffer::getFileOrSTDIN(VocabFile, /IsText=/true);
- if (!BufOrError) {
- return createFileError(VocabFile, BufOrError.getError());
- }
- auto Content = BufOrError.get()->getBuffer();
- json::Path::Root Path("");
- Expectedjson::Value ParsedVocabValue = json::parse(Content);
- if (!ParsedVocabValue)
- return ParsedVocabValue.takeError();
- bool Res = json::fromJSON(*ParsedVocabValue, Vocabulary, Path);
- if (!Res) {
- return createStringError(errc::illegal_byte_sequence,
"Unable to parse the vocabulary");
- }
- assert(Vocabulary.size() > 0 && "Vocabulary is empty");
- unsigned Dim = Vocabulary.begin()->second.size();
- assert(Dim > 0 && "Dimension of vocabulary is zero");
- assert(std::all_of(Vocabulary.begin(), Vocabulary.end(),
[Dim](const std::pair<StringRef, Embedding> &Entry) {
return Entry.second.size() == Dim;
}) &&
"All vectors in the vocabulary are not of the same dimension");
- this->DIM = Dim;
- return Error::success(); +}
- +IR2VecVocabAnalysis::Result +IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
- auto Ctx = &M.getContext();
- if (VocabFile.empty()) {
- // FIXME: Use default vocabulary
- Ctx->emitError("IR2Vec vocabulary file path not specified");
- return IR2VecVocabResult(); // Return invalid result
- }
- if (auto Err = readVocabulary()) {
- handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
Ctx->emitError("Error reading vocabulary: " + EI.message());
- });
- return IR2VecVocabResult();
- }
- return IR2VecVocabResult(std::move(Vocabulary), DIM); +}
//===----------------------------------------------------------------------===//
// IR2VecResult and IR2VecAnalysis
//===----------------------------------------------------------------------===//
- +IR2VecResult::IR2VecResult(
- SmallMapVector<const Instruction *, Embedding, 128> &&InstMap,
- SmallMapVector<const BasicBlock *, Embedding, 16> &&BBMap,
- Embedding &&FuncVector, unsigned Dim)
- : InstVecMap(std::move(InstMap)), BBVecMap(std::move(BBMap)),
FuncVector(std::move(FuncVector)), DIM(Dim), Valid(true) {}
- +const SmallMapVector<const Instruction *, Embedding, 128> & +IR2VecResult::getInstVecMap() const {
- assert(Valid);
- return InstVecMap; +} +const SmallMapVector<const BasicBlock *, Embedding, 16> & +IR2VecResult::getBBVecMap() const {
- assert(Valid);
- return BBVecMap; +} +const Embedding &IR2VecResult::getFunctionVector() const {
- assert(Valid);
- return FuncVector; +} +unsigned IR2VecResult::getDimension() const { return DIM; }
- +IR2VecAnalysis::Result IR2VecAnalysis::run(Function &F,
FunctionAnalysisManager &FAM) {
- auto *VocabRes = FAM.getResult(F)
.getCachedResult<IR2VecVocabAnalysis>(*F.getParent());
- auto Ctx = &F.getContext();
- if (!VocabRes->isValid()) {
- Ctx->emitError("IR2Vec vocabulary is invalid");
- return IR2VecResult();
- }
- auto Dim = VocabRes->getDimension();
- if (Dim <= 0) {
- Ctx->emitError("IR2Vec vocabulary dimension is zero");
- return IR2VecResult();
- }
- auto Vocabulary = VocabRes->getVocabulary();
- std::unique_ptr Emb;
- switch (IR2VecMode) {
- case IR2VecKind::Symbolic:
- Emb = std::make_unique(F, Vocabulary, Dim);
- break;
- case IR2VecKind::Flowaware:
- // FIXME: Add support for flow-aware embeddings
- llvm_unreachable("Flow-aware embeddings are not supported yet");
- break;
- default:
- llvm_unreachable("Invalid IR2Vec mode");
- }
- Emb->computeEmbeddings();
- auto InstMap = Emb->getInstVecMap();
- auto BBMap = Emb->getBBVecMap();
- auto FuncVec = Emb->getFunctionVector();
- return IR2VecResult(std::move(InstMap), std::move(BBMap), std::move(FuncVec),
Dim);
+} + +// ==----------------------------------------------------------------------===// +// IR2VecPrinterPass +//===----------------------------------------------------------------------===// + +void IR2VecPrinterPass::printVector(const Embedding &Vec) const {
- OS << " [";
- for (auto &Elem : Vec)
- OS << " " << format("%.2f", Elem) << " ";
- OS << "]\n"; +}
- +PreservedAnalyses IR2VecPrinterPass::run(Module &M,
ModuleAnalysisManager &MAM) {
- auto IR2VecVocabResult = MAM.getResult(M);
- assert(IR2VecVocabResult.isValid() && "Vocab is invalid");
- for (Function &F : M) {
- auto &FAM =
MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto IR2VecRes = FAM.getResult(F);
- if (!IR2VecRes.isValid()) {
auto Ctx = &F.getContext();
Ctx->emitError("IR2Vec embeddings are invalid");
return PreservedAnalyses::all();
- }
- OS << "IR2Vec embeddings for function " << F.getName() << ":\n";
- OS << "Function vector: ";
- printVector(IR2VecRes.getFunctionVector());
- OS << "Basic block vectors:\n";
- for (const auto &BBVector : IR2VecRes.getBBVecMap()) {
OS << "Basic block: " << BBVector.first...
[truncated]