From a6507c5ed9bd6268d5e4f81ab22c289acb64d468 Mon Sep 17 00:00:00 2001
From: jianghaibo
Date: Wed, 3 Dec 2025 22:31:54 +0800
Subject: [PATCH] [LICM] Hoist conditional loads with SVE FFR

When an illegal address is accessed, the SVE ldnf1 (non-faulting load)
instruction does not trap, so it can be used to hoist a load that is only
executed conditionally inside a loop.
---
 .../include/llvm/Transforms/Utils/LoopUtils.h |   8 +-
 llvm/lib/Transforms/Scalar/LICM.cpp           | 146 +++++++++++++++++-
 2 files changed, 142 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index cc31fc79c2de..7303d3ca94b5 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -173,10 +173,10 @@ bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *,
 /// \p AllowSpeculation is whether values should be hoisted even if they are not
 /// guaranteed to execute in the loop, but are safe to speculatively execute.
 bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
-                 AssumptionCache *, TargetLibraryInfo *, Loop *,
-                 MemorySSAUpdater &, ScalarEvolution *, ICFLoopSafetyInfo *,
-                 SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool,
-                 bool AllowSpeculation);
+                 AssumptionCache *, TargetLibraryInfo *, TargetTransformInfo *,
+                 Loop *, MemorySSAUpdater &, ScalarEvolution *,
+                 ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &,
+                 OptimizationRemarkEmitter *, bool, bool AllowSpeculation);
 
 /// Return true if the induction variable \p IV in a Loop whose latch is
 /// \p LatchBlock would become dead if the exit test \p Cond were removed.
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 71b567bc7c96..785afabf7141 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -60,6 +60,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -68,6 +69,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -128,6 +130,10 @@ static cl::opt<bool> DisableMovStoreInsOutsideOfLoopInSigFun(
     cl::desc(
         "Disable move store instruction outside of loop in signal function."));
 
+static cl::opt<bool> EnableHoistCondLoad(
+    "licm-hoist-cond-load", cl::Hidden, cl::init(true),
+    cl::desc("Enable hoisting of conditional loads in LICM"));
+
 static cl::opt<unsigned> MaxNumUsesTraversed(
     "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
     cl::desc("Max num uses visited for identifying load "
@@ -202,6 +208,13 @@ using PointersAndHasReadsOutsideSet =
 static SmallVector<PointersAndHasReadsOutsideSet, 0>
 collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L);
 
+static ScalableVectorType *getSVEContainerType(Type *EltTy);
+
+static bool findConditionalLoad(
+    LoopInfo *LI, Loop *CurLoop, TargetTransformInfo *TTI, Instruction *I);
+
+static Instruction *replaceLoadWithLdnf(Instruction *I);
+
 namespace {
 struct LoopInvariantCodeMotion {
   bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
@@ -456,8 +469,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
                           MSSAU, &SafetyInfo, Flags, ORE);
     Flags.setIsSink(false);
     if (Preheader)
-      Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
-                             MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode,
+      Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, TTI,
+                             L, MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode,
                              LicmAllowSpeculation);
 
   // Now that all loop invariants have been removed from the loop, promote any
@@ -855,6 +868,97 @@ public:
 };
 } // namespace
 
+static Instruction *replaceLoadWithLdnf(Instruction *I) {
+  auto *LoadI = dyn_cast<LoadInst>(I);
+  auto *PointerOp = LoadI->getPointerOperand();
+  const DataLayout &DL = LoadI->getModule()->getDataLayout();
+  auto *LITy = LoadI->getType();
+  IRBuilder<> B(LoadI);
+  auto *EltTy =
+      LITy->isPointerTy() ? B.getIntNTy(DL.getPointerSizeInBits()) : LITy;
+  auto *SVTy = getSVEContainerType(EltTy);
+  assert(SVTy && "Unsupported type of load instruction");
+
+  auto *PredTy =
+      ScalableVectorType::get(B.getInt1Ty(), SVTy->getMinNumElements());
+  Value *Imm = B.getInt32(31);
+  CallInst *Pred =
+      B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, {Imm});
+
+  Type *PtrTy =
+      LITy->getPointerTo(PointerOp->getType()->getPointerAddressSpace());
+  if (PointerOp->getType() != PtrTy)
+    PointerOp = B.CreateBitCast(PointerOp, PtrTy);
+
+  CallInst *Ldnf =
+      B.CreateIntrinsic(Intrinsic::aarch64_sve_ldnf1, {SVTy}, {Pred, PointerOp});
+
+  propagateMetadata(Ldnf, LoadI);
+
+  Value *Scalar = B.CreateExtractElement(Ldnf, B.getInt64(0), "extract");
+  if (LITy->isPointerTy()) {
+    Value *PtrValue = B.CreateIntToPtr(
+        Scalar, PointerType::getUnqual(Scalar->getContext()));
+    return dyn_cast<Instruction>(PtrValue);
+  }
+  return dyn_cast<Instruction>(Scalar);
+}
+
+static ScalableVectorType *getSVEContainerType(Type *EltTy) {
+  if (EltTy == Type::getDoubleTy(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 2);
+
+  if (EltTy == Type::getFloatTy(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 4);
+
+  if (EltTy == Type::getBFloatTy(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 8);
+
+  if (EltTy == Type::getHalfTy(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 8);
+
+  if (EltTy == Type::getInt64Ty(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 2);
+
+  if (EltTy == Type::getInt32Ty(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 4);
+
+  if (EltTy == Type::getInt16Ty(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 8);
+
+  if (EltTy == Type::getInt8Ty(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 16);
+
+  return nullptr;
+}
+
+static bool
+findConditionalLoad(LoopInfo *LI, Loop *CurLoop, TargetTransformInfo *TTI,
+                    Instruction *I) {
+  auto LoadI = dyn_cast<LoadInst>(I);
+  if (!LoadI)
+    return false;
+
+  if (LoadI->isAtomic() || LoadI->isVolatile())
+    return false;
+
+  Module *M = LoadI->getModule();
+  Triple TargetTriple(M->getTargetTriple());
+  if (!TargetTriple.isAArch64() || !TTI->supportsScalableVectors())
+    return false;
+
+  // TODO: add support for vector types.
+  if (LoadI->getType()->isVectorTy())
+    return false;
+
+  auto PointerOp = LoadI->getPointerOperand();
+  if (isa(PointerOp))
+    return true;
+
+  auto *PI = dyn_cast<Instruction>(PointerOp);
+  if (!PI || CurLoop->contains(PI))
+    return false;
+
+  return true;
+}
+
+
 /// Walk the specified region of the CFG (defined by all blocks dominated by
 /// the specified block, and that are in the current loop) in depth first
 /// order w.r.t the DominatorTree.  This allows us to visit definitions before
@@ -862,9 +966,9 @@ public:
 ///
 bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
                        DominatorTree *DT, AssumptionCache *AC,
-                       TargetLibraryInfo *TLI, Loop *CurLoop,
-                       MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
-                       ICFLoopSafetyInfo *SafetyInfo,
+                       TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+                       Loop *CurLoop, MemorySSAUpdater &MSSAU,
+                       ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
                        SinkAndHoistLICMFlags &Flags,
                        OptimizationRemarkEmitter *ORE, bool LoopNestMode,
                        bool AllowSpeculation) {
@@ -893,6 +997,11 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
       continue;
 
     for (Instruction &I : llvm::make_early_inc_range(*BB)) {
+      bool SafeHoist =
+          isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, ORE,
+                                         Preheader->getTerminator(), AC,
+                                         AllowSpeculation);
+
       // Try hoisting the instruction out to the preheader. We can only do
       // this if all of the operands of the instruction are loop invariant and
       // if it is safe to hoist the instruction. We also check block frequency
@@ -902,9 +1011,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
      // to that block.
      if (CurLoop->hasLoopInvariantOperands(&I) &&
          canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) &&
-          isSafeToExecuteUnconditionally(
-              I, DT, TLI, CurLoop, SafetyInfo, ORE,
-              Preheader->getTerminator(), AC, AllowSpeculation)) {
+          SafeHoist) {
        hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
              MSSAU, SE, ORE);
        HoistedInstructions.push_back(&I);
@@ -912,6 +1019,29 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
         continue;
       }
 
+      if (!SafeHoist) {
+        if (findConditionalLoad(LI, CurLoop, TTI, &I)) {
+          LLVM_DEBUG(dbgs() << "LICM: found conditional load: " << I << "\n");
+          if (CurLoop->hasLoopInvariantOperands(&I) &&
+              canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) &&
+              (getSVEContainerType(I.getType()) || I.getType()->isPointerTy())) {
+            hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+                  MSSAU, SE, ORE);
+            // Replace the hoisted load with @llvm.aarch64.sve.ldnf1.*
+            Instruction *ExtractI = replaceLoadWithLdnf(&I);
+            assert(ExtractI && "Failed to create ldnf1 to replace load");
+            if (ExtractI) {
+              LLVM_DEBUG(dbgs() << "LICM: replaced with ldnf1: " << *ExtractI << "\n");
+              I.replaceAllUsesWith(ExtractI);
+              eraseInstruction(I, *SafetyInfo, MSSAU);
+
+              Changed = true;
+              continue;
+            }
+          }
+        }
+      }
+
       // Attempt to remove floating point division out of the loop by
       // converting it to a reciprocal multiplication.
       if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() &&
--
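
Illustration (not part of the patch): a hand-written sketch of the intended
transformation, assuming an AArch64 target with SVE and the new
"licm-hoist-cond-load" option left at its default. The function and value
names below are invented for this example and are not taken from the patch or
its tests.

Before hoisting, the address %p is loop invariant, but the load only executes
when %cond holds, so plain LICM may not speculate it into the preheader:

    define i32 @guarded_sum(ptr %pp, ptr %flags, i64 %n) {
    entry:
      %p = load ptr, ptr %pp
      br label %loop

    loop:
      %i = phi i64 [ 0, %entry ], [ %i.next, %latch ]
      %acc = phi i32 [ 0, %entry ], [ %acc.next, %latch ]
      %f.addr = getelementptr inbounds i8, ptr %flags, i64 %i
      %f = load i8, ptr %f.addr
      %cond = icmp ne i8 %f, 0
      br i1 %cond, label %if.then, label %latch

    if.then:
      %v = load i32, ptr %p      ; loop-invariant address, conditional execution
      br label %latch

    latch:
      %val = phi i32 [ %v, %if.then ], [ 0, %loop ]
      %acc.next = add i32 %acc, %val
      %i.next = add i64 %i, 1
      %done = icmp eq i64 %i.next, %n
      br i1 %done, label %exit, label %loop

    exit:
      ret i32 %acc.next
    }

With this patch, the load is expected to be rewritten in the preheader as a
non-faulting SVE load whose lane 0 replaces the original scalar result,
roughly:

    entry:
      %p = load ptr, ptr %pp
      %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
      %wide = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> %pg, ptr %p)
      %v = extractelement <vscale x 4 x i32> %wide, i64 0
      br label %loop

The ptrue immediate 31 (SV_ALL) is the all-true pattern emitted by
replaceLoadWithLdnf, and ldnf1 does not trap if %p is not dereferenceable, so
the hoisted load stays safe even when the guarded path is never taken.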