Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[flang][OpenMP] Implement more robust loop-nest detection logic #127

Open
wants to merge 1 commit into
base: amd-trunk-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 78 additions & 52 deletions flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ namespace flangomp {
#include "flang/Optimizer/OpenMP/Passes.h.inc"
} // namespace flangomp

#define DEBUG_TYPE "fopenmp-do-concurrent-conversion"
#define DEBUG_TYPE "do-concurrent-conversion"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")

namespace Fortran {
namespace lower {
Expand Down Expand Up @@ -366,6 +367,81 @@ void collectIndirectConstOpChain(mlir::Operation *link,
opChain.insert(link);
}

/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
/// there are no operations in \p outerloop's other than:
///
/// 1. those operations needed to setup \p innerLoop's LB, UB, and step values,
/// 2. the operations needed to assing/update \p outerLoop's induction variable.
/// 3. \p innerLoop itself.
///
/// \p return true if \p innerLoop is perfectly nested inside \p outerLoop
/// according to the above definition.
bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
mlir::BackwardSliceOptions backwardSliceOptions;
backwardSliceOptions.inclusive = true;
// We will collect the backward slices for innerLoop's LB, UB, and step.
// However, we want to limit the scope of these slices to the scope of
// outerLoop's region.
backwardSliceOptions.filter = [&](mlir::Operation *op) {
return !mlir::areValuesDefinedAbove(op->getResults(),
outerLoop.getRegion());
};

llvm::SetVector<mlir::Operation *> lbSlice;
mlir::getBackwardSlice(innerLoop.getLowerBound(), &lbSlice,
backwardSliceOptions);

llvm::SetVector<mlir::Operation *> ubSlice;
mlir::getBackwardSlice(innerLoop.getUpperBound(), &ubSlice,
backwardSliceOptions);

llvm::SetVector<mlir::Operation *> stepSlice;
mlir::getBackwardSlice(innerLoop.getStep(), &stepSlice, backwardSliceOptions);

mlir::ForwardSliceOptions forwardSliceOptions;
forwardSliceOptions.inclusive = true;
// We don't care of the outer loop's induction variable's uses within the
// inner loop, so we filter out these uses.
forwardSliceOptions.filter = [&](mlir::Operation *op) {
return mlir::areValuesDefinedAbove(op->getResults(), innerLoop.getRegion());
};

llvm::SetVector<mlir::Operation *> indVarSlice;
mlir::getForwardSlice(outerLoop.getInductionVar(), &indVarSlice,
forwardSliceOptions);

llvm::SetVector<mlir::Operation *> innerLoopSetupOpsVec;
innerLoopSetupOpsVec.set_union(indVarSlice);
innerLoopSetupOpsVec.set_union(lbSlice);
innerLoopSetupOpsVec.set_union(ubSlice);
innerLoopSetupOpsVec.set_union(stepSlice);
llvm::DenseSet<mlir::Operation *> innerLoopSetupOpsSet;

for (mlir::Operation *op : innerLoopSetupOpsVec)
innerLoopSetupOpsSet.insert(op);

llvm::DenseSet<mlir::Operation *> loopBodySet;
outerLoop.walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *op) {
if (op == outerLoop)
return mlir::WalkResult::advance();

if (op == innerLoop)
return mlir::WalkResult::skip();

if (op->hasTrait<mlir::OpTrait::IsTerminator>())
return mlir::WalkResult::advance();

loopBodySet.insert(op);
return mlir::WalkResult::advance();
});

bool result = (loopBodySet == innerLoopSetupOpsSet);
mlir::Location loc = outerLoop.getLoc();
LLVM_DEBUG(DBGS() << "Loop pair starting at location " << loc << " is"
<< (result ? "" : " not") << " perfectly nested\n");
return result;
}

/// Starting with `outerLoop` collect a perfectly nested loop nest, if any. This
/// function collects as much as possible loops in the nest; it case it fails to
/// recognize a certain nested loop as part of the nest it just returns the
Expand Down Expand Up @@ -406,57 +482,7 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp outerLoop,
llvm::SmallVector<mlir::Value> nestedLiveIns;
collectLoopLiveIns(nestedUnorderedLoop, nestedLiveIns);

llvm::DenseSet<mlir::Value> outerLiveInsSet;
llvm::DenseSet<mlir::Value> nestedLiveInsSet;

// Returns a "unified" view of an mlir::Value. This utility checks if the
// value is defined by an op, and if so, return the first value defined by
// that op (if there are many), otherwise just returns the value.
//
// This serves the purpose that if, for example, `%op_res#0` is used in the
// outer loop and `%op_res#1` is used in the nested loop (or vice versa),
// that we detect both as the same value. If we did not do so, we might
// falesely detect that the 2 loops are not perfectly nested since they use
// "different" sets of values.
auto getUnifiedLiveInView = [](mlir::Value liveIn) {
return liveIn.getDefiningOp() != nullptr
? liveIn.getDefiningOp()->getResult(0)
: liveIn;
};

// Re-package both lists of live-ins into sets so that we can use set
// equality to compare the values used in the outerloop vs. the nestd one.

for (auto liveIn : nestedLiveIns)
nestedLiveInsSet.insert(getUnifiedLiveInView(liveIn));

mlir::Value outerLoopIV;
for (auto liveIn : outerLoopLiveIns) {
outerLiveInsSet.insert(getUnifiedLiveInView(liveIn));

// Keep track of the IV of the outerloop. See `isPerfectlyNested` for more
// info on the reason.
if (outerLoopIV == nullptr)
outerLoopIV = getUnifiedLiveInView(liveIn);
}

// For the 2 loops to be perfectly nested, either:
// * both would have exactly the same set of live-in values or,
// * the outer loop would have exactly 1 extra live-in value: the outer
// loop's induction variable; this happens when the outer loop's IV is
// *not* referenced in the nested loop.
bool isPerfectlyNested = [&]() {
if (outerLiveInsSet == nestedLiveInsSet)
return true;

if ((outerLiveInsSet.size() == nestedLiveIns.size() + 1) &&
!nestedLiveInsSet.contains(outerLoopIV))
return true;

return false;
}();

if (!isPerfectlyNested)
if (!isPerfectlyNested(outerLoop, nestedUnorderedLoop))
return mlir::failure();

outerLoop = nestedUnorderedLoop;
Expand Down
89 changes: 89 additions & 0 deletions flang/test/Transforms/DoConcurrent/loop_nest_test.f90
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please alter the test to not rely on debug statements for failure/success determination.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not ideal indeed. Do you have any suggestions on how to do this differently?

I thought about doing this as a unit test, but setting up the test and later reading the test I think would be more complicated than a lit test the clearly shows what the loops look like and what the outcome should be.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can a special flag to print loop info to llvm::outs(). But I am not sure this is worth it tbh.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't like relying on debug output either. It randomly interleaves with stdout and only works with assertions-builds. Additionally, it makes the test dependent on how often/in which order the isPerfectlyNested function is called internally, making it sensitive to even NFC patches. But I have seen this in LLVM and Clang enough to be consider a established pattern there, although not for MLIR/Flang.

If you do this, you still must:

  1. Do not redirect/pipe stdout and stderr at the same time. either guarentee that just one of the ones is used or 2> instead &>
  2. Add REQUIRES: asserts or it will fail in release builds

In MLIR I indeed usually see an internal option enabling additional printing. or print debug counters E.g. -test-print-shape-mapping, or a pass that just prints the analysis result, e.g. -pass-pipeline=...test-print-dominance, or test the diagnostic from -Rpass output.

Since it is a transformation pass, one would typically (also) test whether the output is what was expected.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the info @Meinersbur, really useful.

I used 2> and REQUIRES: asserts.

Since it is a transformation pass, one would typically (also) test whether the output is what was expected.

All other do-concurrent-conversion tests verify the output. I wanted this one to test only one thing which is loop-nest detection. My reasoning is that this test isolates this particular part of the pass so that we debug issues in nest detection more easily.

Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
! Tests loop-nest detection algorithm for do-concurrent mapping.

! REQUIRES: asserts

! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host \

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do-concurrent-conversion is a MLIR-to-MLIR pass. Those tests usually only contains the input MLIR of the pass so we don't test more than necessary.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did this to show loops on the Fortran source level. Just makes it easy to correlate for which loop nests do we detect that they are perfectly nested.

If you don't think this is a good enough arugment to have the test on the fortran level, I will replace it with MLIR instead.

! RUN: -mmlir -debug %s -o - 2> %t.log || true

! RUN: FileCheck %s < %t.log

program main
implicit none

contains

subroutine foo(n)
implicit none
integer :: n, m
integer :: i, j, k
integer :: x
integer, dimension(n) :: a
integer, dimension(n, n, n) :: b

! NOTE This for sure is a perfect loop nest. However, the way `do-concurrent`
! loops are now emitted by flang is probably not correct. This is being looked
! into at the moment and once we have flang emitting proper loop headers, we
! will revisit this.
!
! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
do concurrent(i=1:n, j=1:bar(n*m, n/m))
a(i) = n
end do

! NOTE same as above.
!
! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m))
a(i) = n
end do

! NOTE This is **not** a perfect nest since the inner call to `bar` will allocate
! memory for the temp results of `n*m` and `n/m` **inside** the outer loop.
!
! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
do concurrent(i=bar(n, x):n)
do concurrent(j=1:bar(n*m, n/m))
a(i) = n
end do
end do

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
do concurrent(i=1:n)
x = 10
do concurrent(j=1:m)
b(i,j,k) = i * j + k
end do
end do

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
do concurrent(i=1:n)
do concurrent(j=1:m)
b(i,j,k) = i * j + k
end do
x = 10
end do

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
do concurrent(i=1:n)
do concurrent(j=1:m)
b(i,j,k) = i * j + k
x = 10
end do
end do
end subroutine

pure function bar(n, m)
implicit none
integer, intent(in) :: n, m
integer :: bar

bar = n + m
end function

end program main