diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 991c3ac8f7446c..cbbbec0ccc8c4d 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -209,6 +209,15 @@ static cl::opt EnableLoopFlatten("enable-loop-flatten", cl::init(false), cl::Hidden, cl::desc("Enable the LoopFlatten Pass")); +// Experimentally allow loop header duplication. This should allow for better +// optimization at Oz, since loop-idiom recognition can then recognize things +// like memcpy. If this ends up being useful for many targets, we should drop +// this flag and make a code generation option that can be controlled +// independent of the opt level and exposed through the frontend. +static cl::opt EnableLoopHeaderDuplication( + "enable-loop-header-duplication", cl::init(false), cl::Hidden, + cl::desc("Enable loop header duplication at any optimization level")); + static cl::opt EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), @@ -630,8 +639,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, /*AllowSpeculation=*/false)); // Disable header duplication in loop rotation at -Oz. - LPM1.addPass( - LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase))); + LPM1.addPass(LoopRotatePass(EnableLoopHeaderDuplication || + Level != OptimizationLevel::Oz, + isLTOPreLink(Phase))); // TODO: Investigate promotion cap for O1. LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, /*AllowSpeculation=*/true)); @@ -812,7 +822,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, // Disable header duplication in loop rotation at -Oz. 
MPM.addPass(createModuleToFunctionPassAdaptor( createFunctionToLoopPassAdaptor( - LoopRotatePass(Level != OptimizationLevel::Oz), + LoopRotatePass(EnableLoopHeaderDuplication || + Level != OptimizationLevel::Oz), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false), PTO.EagerlyInvalidateAnalyses)); @@ -1422,7 +1433,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, LoopPassManager LPM; // First rotate loops that may have been un-rotated by prior passes. // Disable header duplication at -Oz. - LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink)); + LPM.addPass(LoopRotatePass(EnableLoopHeaderDuplication || + Level != OptimizationLevel::Oz, + LTOPreLink)); // Some loops may have become dead by now. Try to delete them. // FIXME: see discussion in https://reviews.llvm.org/D112851, // this may need to be revisited once we run GVN before loop deletion diff --git a/llvm/test/Transforms/LoopRotate/oz-disable.ll b/llvm/test/Transforms/LoopRotate/oz-disable.ll index 6a7847ac0ff215..c45603878ee65c 100644 --- a/llvm/test/Transforms/LoopRotate/oz-disable.ll +++ b/llvm/test/Transforms/LoopRotate/oz-disable.ll @@ -4,6 +4,9 @@ ; RUN: opt < %s -S -passes='default' -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OS ; RUN: opt < %s -S -passes='default' -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OZ +;; Make sure -enable-loop-header-duplication overrides the default behavior at Oz +; RUN: opt < %s -S -passes='default' -enable-loop-header-duplication -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OS + ; Loop should be rotated for -Os but not for -Oz. 
; OS: rotating Loop at depth 1 ; OZ-NOT: rotating Loop at depth 1 diff --git a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll new file mode 100644 index 00000000000000..98b11578b49fbf --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 + +;; Check that -enable-loop-header-duplication at Oz enables certain types of +;; optimizations, for example replacing the loop body w/ a call to memset. If +;; loop idiom recognition begins to recognize unrotated loops, this test will +;; need to be updated. + +; RUN: opt -passes='default' -S < %s | FileCheck %s --check-prefix=NOROTATION +; RUN: opt -passes='default' -S -enable-loop-header-duplication < %s | FileCheck %s --check-prefix=ROTATION +; RUN: opt -passes='default' -S < %s | FileCheck %s --check-prefix=ROTATION + +define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr { +; NOROTATION-LABEL: define void @test( +; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; NOROTATION-NEXT: entry: +; NOROTATION-NEXT: br label [[LOOP_HEADER:%.*]] +; NOROTATION: loop.header: +; NOROTATION-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], [[ENTRY:%.*]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; NOROTATION-NEXT: [[_12_I:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; NOROTATION-NEXT: br i1 [[_12_I]], label [[EXIT:%.*]], label [[LOOP_LATCH]] +; NOROTATION: loop.latch: +; NOROTATION-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1 +; NOROTATION-NEXT: store i8 1, ptr [[PTR_IV]], align 1 +; NOROTATION-NEXT: br label [[LOOP_HEADER]] +; NOROTATION: exit: +; NOROTATION-NEXT: ret void +; +; ROTATION-LABEL: define void @test( +; ROTATION-SAME: ptr noalias nonnull writeonly 
align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; ROTATION-NEXT: entry: +; ROTATION-NEXT: [[_12_I1:%.*]] = icmp eq ptr [[START]], [[END]] +; ROTATION-NEXT: br i1 [[_12_I1]], label [[EXIT:%.*]], label [[LOOP_LATCH_PREHEADER:%.*]] +; ROTATION: loop.latch.preheader: +; ROTATION-NEXT: [[END3:%.*]] = ptrtoint ptr [[END]] to i64 +; ROTATION-NEXT: [[START4:%.*]] = ptrtoint ptr [[START]] to i64 +; ROTATION-NEXT: [[TMP0:%.*]] = sub i64 [[END3]], [[START4]] +; ROTATION-NEXT: tail call void @llvm.memset.p0.i64(ptr nonnull align 1 [[START]], i8 1, i64 [[TMP0]], i1 false) +; ROTATION-NEXT: br label [[EXIT]] +; ROTATION: exit: +; ROTATION-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %ptr.iv = phi i8* [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] + %_12.i = icmp eq i8* %ptr.iv, %end + br i1 %_12.i, label %exit, label %loop.latch + +loop.latch: + %ptr.iv.next = getelementptr inbounds i8, i8* %ptr.iv, i64 1 + store i8 1, i8* %ptr.iv, align 1 + br label %loop.header + +exit: + ret void +}