Add amdgpu target by Flakebi · Pull Request #134740 · rust-lang/rust (original) (raw)

I'm not sure that size is really what drives the higher instruction counts, but it does look like the mere possibility of cross-compiling to AMDGPU enables more passes/logic(?), even though those presumably don't do anything on x86. Maybe this is an optimization opportunity for LLVM/clang? It might be unavoidable with the architecture LLVM has today, though.

Sampling a few cachegrind diffs:

helloworld:

--------------------------------------------------------------------------------
-- File:function summary
--------------------------------------------------------------------------------
  Ir______  file:function

<  575,695  ???:
   844,600    llvm::PassRegistry::enumerateWith(llvm::PassRegistrationListener*)
  -782,786    llvm::PassRegistry::enumerateWith(llvm::PassRegistrationListener*) [clone .warm]
   -69,691    llvm::FPPassManager::runOnFunction(llvm::Function&)
    60,129    llvm::MVT::getScalableVectorVT(llvm::MVT, unsigned int)
    54,986    ecache_evict
   -49,525    llvm::SelectionDAGISel::CodeGenAndEmitDAG()
   -41,251    std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > std::copy<llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlock*, 8u>, false, ll>
    40,624    llvm::StringMapImpl::RehashTable(unsigned int)
    39,256    llvm::StringMapImpl::LookupBucketFor(llvm::StringRef, unsigned int)
    38,978    edata_cache_get
    38,420    llvm::InstCombinerImpl::visitCallBase(llvm::CallBase&)
    37,509    llvm::AnalysisManager<llvm::LazyCallGraph::SCC, llvm::LazyCallGraph&>::invalidate(llvm::LazyCallGraph::SCC&, llvm::PreservedAnalyses const&)
    36,129    llvm::SelectionDAG::Legalize()
   -33,750    llvm::PassRegistry::registerPass(llvm::PassInfo const&, bool)
   -33,501    llvm::InstCombinerImpl::visitCallInst(llvm::CallInst&)
   -31,525    tcache_bin_flush_small
    31,343    llvm::PMTopLevelManager::findAnalysisPassInfo(void const*) const
    30,580    eset_remove

clap:

--------------------------------------------------------------------------------
-- File:function summary
--------------------------------------------------------------------------------
  Ir__________  file:function

<  209,305,246  ???:
  -230,519,105    llvm::computeKnownBitsFromContext(llvm::Value const*, llvm::KnownBits&, unsigned int, llvm::SimplifyQuery const&)
  -132,162,101    computeKnownBitsFromOperator(llvm::Operator const*, llvm::APInt const&, llvm::KnownBits&, unsigned int, llvm::SimplifyQuery const&) [clone
   124,686,235    computeKnownBitsFromOperator(llvm::Operator const*, llvm::APInt const&, llvm::KnownBits&, unsigned int, llvm::SimplifyQuery const&)
   114,060,111    llvm::LiveIntervalCalc::calculate(llvm::LiveInterval&, bool)
  -113,662,179    llvm::LiveIntervals::computeVirtRegs()
   104,218,878    llvm::PointerMayBeCaptured(llvm::Value const*, llvm::CaptureTracker*, unsigned int)
   -97,035,202    llvm::SelectionDAGISel::CodeGenAndEmitDAG()
   -90,098,002    llvm::AAResults::getModRefInfo(llvm::Instruction const*, std::optional<llvm::MemoryLocation> const&, llvm::AAQueryInfo&)
    89,433,445    llvm::RAGreedy::calculateRegionSplitCostAroundReg(unsigned short, llvm::AllocationOrder&, llvm::BlockFrequency&, unsigned int&, unsigned int&)
   -83,724,622    llvm::RAGreedy::calculateRegionSplitCost(llvm::LiveInterval const&, llvm::AllocationOrder&, llvm::BlockFrequency&, unsigned int&, bool)
    82,820,615    computeKnownBits(llvm::Value const*, llvm::APInt const&, llvm::KnownBits&, unsigned int, llvm::SimplifyQuery const&) [clone
   -72,563,818    llvm::ScheduleDAGSDNodes::BuildSchedGraph(llvm::AAResults*)
    71,100,555    llvm::SelectionDAG::Legalize()
   -70,774,808    std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > std::copy<llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlock*, 8u>, false, llvm::GraphTraits<llvm::BasicBlock*> >, std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > >(llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlo>
    68,499,576    llvm::ScheduleDAGSDNodes::AddSchedEdges()
   -67,376,856    (anonymous namespace)::TailRecursionEliminator::eliminate(llvm::Function&, llvm::TargetTransformInfo const*, llvm::AAResults*, llvm::OptimizationRemarkEmitter*, llvm::DomTreeUpdater&) [clone
    66,298,390    computePointerICmp(llvm::CmpInst::Predicate, llvm::Value*, llvm::Value*, llvm::SimplifyQuery const&)
    62,749,680    std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > std::__copy_move_a2<false, llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrSet<llvm::BasicBlock*, 8u>, false, llvm::GraphTraits<llvm::BasicBlock*> >, std::back_insert_iterator<llvm::SmallVector<llvm::BasicBlock*, 8u> > >(llvm::po_iterator<llvm::BasicBlock*, llvm::SmallPtrS>
   -62,208,865    simplifyICmpInst(unsigned int, llvm::Value*, llvm::Value*, llvm::SimplifyQuery const&, unsigned int) [clone
    59,398,553    llvm::SCCPInstVisitor::markUsersAsChanged(llvm::Value*)
   -58,896,104    llvm::RegAllocBase::allocatePhysRegs()
    57,899,944    llvm::RAGreedy::selectOrSplitImpl(llvm::LiveInterval const&, llvm::SmallVectorImpl<llvm::Register>&, llvm::SmallSet<llvm::Register, 16u, std::less<llvm::Register> >&, llvm::SmallVector<std::pair<llvm::LiveInterval const*, llvm::MCRegister>, 8u>&, unsigned int)