Add amdgpu target by Flakebi · Pull Request #134740 · rust-lang/rust (original) (raw)
I'm not sure that size is really what drives the higher instruction counts, but it does look like the mere possibility of cross-compiling to AMDGPU enables more passes/logic(?), even though those presumably don't do anything on x86. Maybe this is an optimization opportunity for LLVM/clang? It might be unavoidable with the architecture LLVM has today, though.
Sampling a few cachegrind diffs:
helloworld:
--------------------------------------------------------------------------------
-- File:function summary
--------------------------------------------------------------------------------
Ir______ file:function
< 575,695 ???:
844,600 llvm::PassRegistry::enumerateWith(llvm::PassRegistrationListener*)
-782,786 llvm::PassRegistry::enumerateWith(llvm::PassRegistrationListener*) [clone .warm]
-69,691 llvm::FPPassManager::runOnFunction(llvm::Function&)
60,129 llvm::MVT::getScalableVectorVT(llvm::MVT, unsigned int)
54,986 ecache_evict
-49,525 llvm::SelectionDAGISel::CodeGenAndEmitDAG()
-41,251 std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > std::copy<llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlock*, 8u>, false, ll>
40,624 llvm::StringMapImpl::RehashTable(unsigned int)
39,256 llvm::StringMapImpl::LookupBucketFor(llvm::StringRef, unsigned int)
38,978 edata_cache_get
38,420 llvm::InstCombinerImpl::visitCallBase(llvm::CallBase&)
37,509 llvm::AnalysisManager<llvm::LazyCallGraph::SCC, llvm::LazyCallGraph&>::invalidate(llvm::LazyCallGraph::SCC&, llvm::PreservedAnalyses const&)
36,129 llvm::SelectionDAG::Legalize()
-33,750 llvm::PassRegistry::registerPass(llvm::PassInfo const&, bool)
-33,501 llvm::InstCombinerImpl::visitCallInst(llvm::CallInst&)
-31,525 tcache_bin_flush_small
31,343 llvm::PMTopLevelManager::findAnalysisPassInfo(void const*) const
30,580 eset_remove
clap:
--------------------------------------------------------------------------------
-- File:function summary
--------------------------------------------------------------------------------
Ir__________ file:function
< 209,305,246 ???:
-230,519,105 llvm::computeKnownBitsFromContext(llvm::Value const*, llvm::KnownBits&, unsigned int, llvm::SimplifyQuery const&)
-132,162,101 computeKnownBitsFromOperator(llvm::Operator const*, llvm::APInt const&, llvm::KnownBits&, unsigned int, llvm::SimplifyQuery const&) [clone
124,686,235 computeKnownBitsFromOperator(llvm::Operator const*, llvm::APInt const&, llvm::KnownBits&, unsigned int, llvm::SimplifyQuery const&)
114,060,111 llvm::LiveIntervalCalc::calculate(llvm::LiveInterval&, bool)
-113,662,179 llvm::LiveIntervals::computeVirtRegs()
104,218,878 llvm::PointerMayBeCaptured(llvm::Value const*, llvm::CaptureTracker*, unsigned int)
-97,035,202 llvm::SelectionDAGISel::CodeGenAndEmitDAG()
-90,098,002 llvm::AAResults::getModRefInfo(llvm::Instruction const*, std::optional<llvm::MemoryLocation> const&, llvm::AAQueryInfo&)
89,433,445 llvm::RAGreedy::calculateRegionSplitCostAroundReg(unsigned short, llvm::AllocationOrder&, llvm::BlockFrequency&, unsigned int&, unsigned int&)
-83,724,622 llvm::RAGreedy::calculateRegionSplitCost(llvm::LiveInterval const&, llvm::AllocationOrder&, llvm::BlockFrequency&, unsigned int&, bool)
82,820,615 computeKnownBits(llvm::Value const*, llvm::APInt const&, llvm::KnownBits&, unsigned int, llvm::SimplifyQuery const&) [clone
-72,563,818 llvm::ScheduleDAGSDNodes::BuildSchedGraph(llvm::AAResults*)
71,100,555 llvm::SelectionDAG::Legalize()
-70,774,808 std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > std::copy<llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlock*, 8u>, false, llvm::GraphTraits<llvm::BasicBlock*> >, std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > >(llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlo>
68,499,576 llvm::ScheduleDAGSDNodes::AddSchedEdges()
-67,376,856 (anonymous namespace)::TailRecursionEliminator::eliminate(llvm::Function&, llvm::TargetTransformInfo const*, llvm::AAResults*, llvm::OptimizationRemarkEmitter*, llvm::DomTreeUpdater&) [clone
66,298,390 computePointerICmp(llvm::CmpInst::Predicate, llvm::Value*, llvm::Value*, llvm::SimplifyQuery const&)
62,749,680 std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > std::__copy_move_a2<false, llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlock*, 8u>, false, llvm::GraphTraits<llvm::BasicBlock*> >, std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > >(llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrS>
-62,208,865 simplifyICmpInst(unsigned int, llvm::Value*, llvm::Value*, llvm::SimplifyQuery const&, unsigned int) [clone
59,398,553 llvm::SCCPInstVisitor::markUsersAsChanged(llvm::Value*)
-58,896,104 llvm::RegAllocBase::allocatePhysRegs()
57,899,944 llvm::RAGreedy::selectOrSplitImpl(llvm::LiveInterval const&, llvm::SmallVectorImpl<llvm::Register>&, llvm::SmallSet<llvm::Register, 16u, std::less<llvm::Register> >&, llvm::SmallVector<std::pair<llvm::LiveInterval const*, llvm::MCRegister>, 8u>&, unsigned int)