PR #17950: Remove Post Layout Assignment Collective Pipeliner
Imported from GitHub PR #17950

Removes the option to run the collective pipeliner pass after layout assignment. This option was introduced in #12866 to preserve FP8 quantization and dequantization patterns and was made obsolete by the reordering of the collective optimization passes in #17453.
Copybara import of the project:

--
e9e9c1a by Philipp Hack <phack@nvidia.com>:

Removes the option to run the collective pipeliner post layout assignment.

--
c5489ac by Philipp Hack <phack@nvidia.com>:

Removes the option to run the collective pipeliner post layout assignment.

Merging this change closes #17950

FUTURE_COPYBARA_INTEGRATE_REVIEW=#17950 from philipphack:u_collective_flag_xla c5489ac
PiperOrigin-RevId: 684256309
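In practical terms, users who previously relied on the removed option now only need the pre-existing pipelining knobs. A minimal sketch of the user-visible difference, assuming the DebugOptions setters that appear in the diffs below:

// Sketch only: illustrates the effect of this change on DebugOptions usage.
xla::DebugOptions opts;
// Before: the pipeliner could be deferred until after layout assignment via
//   opts.set_xla_gpu_run_post_layout_collective_pipeliner(true);  // removed by this PR
// After: collective pipelining is controlled solely by the existing pre-layout
// options, and the pipeliner always runs in the "collective-optimizations"
// pipeline before layout assignment.
opts.set_xla_gpu_enable_pipelined_all_reduce(true);
opts.set_xla_gpu_enable_pipelined_all_gather(true);
opts.set_xla_gpu_enable_pipelined_reduce_scatter(true);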
philipphack authored and Google-ML-Automation committed Oct 10, 2024
1 parent d05087b commit 73b6982
Showing 4 changed files with 28 additions and 153 deletions.
8 changes: 0 additions & 8 deletions xla/debug_options_flags.cc
@@ -175,8 +175,6 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
opts.set_xla_gpu_enable_pipelined_reduce_scatter(true);
opts.set_xla_gpu_enable_pipelined_p2p(false);

opts.set_xla_gpu_run_post_layout_collective_pipeliner(false);

opts.set_xla_gpu_collective_permute_decomposer_threshold(
std::numeric_limits<int64_t>::max());

@@ -1559,12 +1557,6 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
bool_setter_for(&DebugOptions::set_xla_gpu_enable_pipelined_p2p),
debug_options->xla_gpu_enable_pipelined_p2p(),
"Enable pipelinling of P2P instructions."));
flag_list->push_back(tsl::Flag(
"xla_gpu_run_post_layout_collective_pipeliner",
bool_setter_for(
&DebugOptions::set_xla_gpu_run_post_layout_collective_pipeliner),
debug_options->xla_gpu_run_post_layout_collective_pipeliner(),
"Move collective pipeliner after the post-layout optimization."));
flag_list->push_back(tsl::Flag(
"xla_gpu_collective_permute_decomposer_threshold",
int64_setter_for(
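For reference, the deleted registration followed the same tsl::Flag pattern as the neighboring boolean options that remain. A sketch of that pattern for one of the surviving pipelining flags (the help text here is illustrative, not copied from the source):

// Sketch of how a boolean DebugOptions knob is exposed as a flag in
// debug_options_flags.cc, mirroring the context lines above.
flag_list->push_back(tsl::Flag(
    "xla_gpu_enable_pipelined_reduce_scatter",
    bool_setter_for(
        &DebugOptions::set_xla_gpu_enable_pipelined_reduce_scatter),
    debug_options->xla_gpu_enable_pipelined_reduce_scatter(),
    "Enable pipelining of reduce-scatter instructions."));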
77 changes: 26 additions & 51 deletions xla/service/gpu/gpu_compiler.cc
@@ -834,8 +834,32 @@ absl::Status RunOptimizationPasses(
return pipeline.Run(hlo_module).status();
}

absl::Status AddCollectivePipelinerPasses(
const DebugOptions& debug_options, HloPassPipeline& collectives_pipeline) {
absl::Status RunCollectiveOptimizationPasses(
HloModule* hlo_module,
const AlgebraicSimplifierOptions& layout_insensitive_algsimp_opts,
se::GpuComputeCapability gpu_version) {
// Optimize collectives generated by SPMD partitioning. Enable these passes
// otherwise as well so that all collectives can get these optimizations.
const DebugOptions& debug_options = hlo_module->config().debug_options();

HloPassPipeline collectives_pipeline("collective-optimizations");
collectives_pipeline.AddPass<AllReduceFolder>();
collectives_pipeline.AddPass<AllReduceSplitter>();
collectives_pipeline.AddPass<AllGatherOptimizer>();
collectives_pipeline.AddPass<AllReduceReassociate>(
debug_options.xla_gpu_enable_reassociation_for_converted_ar());
collectives_pipeline.AddPass<ReduceScatterReassociate>();

collectives_pipeline.AddPass<WhileLoopAllReduceCodeMotion>(
/*enable_reduce_scatter=*/debug_options
.xla_gpu_enable_while_loop_reduce_scatter_code_motion());

// Moves collectives' subsequent quantization before the collective to
// minimize data transfers.
collectives_pipeline.AddPass<CollectiveQuantizer>();
// Remove dead computations after collective quantization.
collectives_pipeline.AddPass<HloDCE>();

if (debug_options.xla_gpu_enable_pipelined_collectives() ||
debug_options.xla_gpu_enable_pipelined_all_reduce()) {
CollectivePipeliner::Config config{
@@ -887,54 +911,7 @@ absl::Status AddCollectivePipelinerPasses(
/*reuse_pipelined_op_buffer=*/HloPredicateFalse};
collectives_pipeline.AddPass<CollectivePipeliner>(config);
}
return absl::OkStatus();
}

absl::Status RunPostLayoutCollectivePipelinerPasses(HloModule* hlo_module) {
const DebugOptions& debug_options = hlo_module->config().debug_options();
HloPassPipeline collectives_pipeline("collective-pipeliner-optimizations");
if (debug_options.xla_gpu_run_post_layout_collective_pipeliner()) {
TF_RETURN_IF_ERROR(
AddCollectivePipelinerPasses(debug_options, collectives_pipeline));
// We call WhileLoopTripCountAnnotator at the end of the collective
// pipeline, which might have changed the loop trip count.
collectives_pipeline.AddPass<WhileLoopTripCountAnnotator>();
// Flatten call graph after loop peeling.
collectives_pipeline.AddPass<FlattenCallGraph>();
}
return collectives_pipeline.Run(hlo_module).status();
}

absl::Status RunCollectiveOptimizationPasses(
HloModule* hlo_module,
const AlgebraicSimplifierOptions& layout_insensitive_algsimp_opts,
se::GpuComputeCapability gpu_version) {
// Optimize collectives generated by SPMD partitioning. Enable these passes
// otherwise as well so that all collectives can get these optimizations.
const DebugOptions& debug_options = hlo_module->config().debug_options();

HloPassPipeline collectives_pipeline("collective-optimizations");
collectives_pipeline.AddPass<AllReduceFolder>();
collectives_pipeline.AddPass<AllReduceSplitter>();
collectives_pipeline.AddPass<AllGatherOptimizer>();
collectives_pipeline.AddPass<AllReduceReassociate>(
debug_options.xla_gpu_enable_reassociation_for_converted_ar());
collectives_pipeline.AddPass<ReduceScatterReassociate>();

collectives_pipeline.AddPass<WhileLoopAllReduceCodeMotion>(
/*enable_reduce_scatter=*/debug_options
.xla_gpu_enable_while_loop_reduce_scatter_code_motion());

// Moves collectives' subsequent quantization before the collective to
// minimize data transfers.
collectives_pipeline.AddPass<CollectiveQuantizer>();
// Remove dead computations after collective quantization.
collectives_pipeline.AddPass<HloDCE>();

if (!debug_options.xla_gpu_run_post_layout_collective_pipeliner()) {
TF_RETURN_IF_ERROR(
AddCollectivePipelinerPasses(debug_options, collectives_pipeline));
}
collectives_pipeline.AddPass<ReduceScatterCreator>();

collectives_pipeline.AddPass<CollectivePermuteCycleDecomposer>(
@@ -1353,8 +1330,6 @@ absl::Status GpuCompiler::OptimizeHloModule(
hlo_module, stream_exec, options, gpu_target_config,
thread_pool.get_mutable()));

TF_RETURN_IF_ERROR(RunPostLayoutCollectivePipelinerPasses(hlo_module));

// This is a "low effort, high impact" fusion that should be run first.
TF_RETURN_IF_ERROR(RunDynamicSliceFusionPasses(hlo_module, PlatformId()));

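Putting the hunks above together, the collective pipeliner is now added only in the pre-layout collective-optimizations pipeline. A simplified sketch of the resulting control flow (the CollectivePipeliner configuration and most passes are elided; this is not the verbatim implementation):

// Simplified sketch of RunCollectiveOptimizationPasses after this change.
absl::Status RunCollectiveOptimizationPasses(
    HloModule* hlo_module,
    const AlgebraicSimplifierOptions& layout_insensitive_algsimp_opts,
    se::GpuComputeCapability gpu_version) {
  const DebugOptions& debug_options = hlo_module->config().debug_options();

  HloPassPipeline collectives_pipeline("collective-optimizations");
  // ... AllReduceFolder, AllReduceSplitter, AllGatherOptimizer, etc. ...
  collectives_pipeline.AddPass<CollectiveQuantizer>();
  collectives_pipeline.AddPass<HloDCE>();

  // The pipeliner runs here, before layout assignment, whenever pipelining is
  // enabled; the post-layout variant and its flag no longer exist.
  if (debug_options.xla_gpu_enable_pipelined_collectives() ||
      debug_options.xla_gpu_enable_pipelined_all_reduce()) {
    // CollectivePipeliner::Config config{...};      // see the hunk above
    // collectives_pipeline.AddPass<CollectivePipeliner>(config);
  }

  collectives_pipeline.AddPass<ReduceScatterCreator>();
  // ... remaining collective passes elided ...
  return collectives_pipeline.Run(hlo_module).status();
}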
93 changes: 0 additions & 93 deletions xla/tests/collective_ops_e2e_test.cc
@@ -1159,99 +1159,6 @@ ENTRY entry {
absl::StrReplaceAll(kModuleReplicatedStr, replacements_), opts);
}

TEST_F(CollectiveOpsTestE2E,
PostLayoutCollectivePipelinerShouldFlattenCallGraph) {
// The allgather in the loop has a nested while loop as its operand,
// when the pipelining happens, the nested while loop will be peeled outside.
// However, when a while is cloned, its call sites are still preserved which
// will error out in alias analysis. When the graph is flattened, the error
// should not happen.
absl::string_view kModuleReplicatedStr = R"(
HloModule module
while_cond {
param = (s32[], f32[2,128], f32[8,128], f32[8,128]) parameter(0)
gte = s32[] get-tuple-element(param), index=0
constant.1 = s32[] constant(3)
ROOT cmp = pred[] compare(gte, constant.1), direction=LT
}
while_nested_cond {
param.nested = (s32[], f32[2,128]) parameter(0)
gte.nested = s32[] get-tuple-element(param.nested), index=0
constant.nested = s32[] constant(3)
ROOT cmp.nested = pred[] compare(gte.nested, constant.nested), direction=LT
}
while_nested_body {
param.body_nested = (s32[], f32[2,128]) parameter(0)
gte.body_nested = s32[] get-tuple-element(param.body_nested), index=0
gte.2.body_nested = f32[2,128] get-tuple-element(param.body_nested), index=1
constant.body_nested = s32[] constant(1)
add.body_nested = s32[] add(gte.body_nested, constant.body_nested)
rsqrt.body_nested = f32[2,128] rsqrt(gte.2.body_nested)
ROOT tuple.body_nested = (s32[], f32[2,128]) tuple(add.body_nested, rsqrt.body_nested)
}
while_body {
param = (s32[], f32[2,128], f32[8,128], f32[8,128]) parameter(0)
get-tuple-element.394 = s32[] get-tuple-element(param), index=0
get-tuple-element.395 = f32[2,128] get-tuple-element(param), index=1
get-tuple-element.35 = f32[8,128] get-tuple-element(param), index=2
get-tuple-element.36 = f32[8,128] get-tuple-element(param), index=3
constant.2557 = s32[] constant(1)
add.230 = s32[] add(get-tuple-element.394, constant.2557)
mul = f32[2,128] multiply(get-tuple-element.395, get-tuple-element.395)
constant.while = s32[] constant(0)
tuple.1 = (s32[], f32[2,128]) tuple(constant.while, mul)
while.1 = (s32[], f32[2,128]) while(tuple.1), condition=while_nested_cond, body=while_nested_body
gte.while = f32[2,128] get-tuple-element(while.1), index=1
add.while = f32[2,128] add(gte.while, get-tuple-element.395)
ag.1 = f32[8,128] all-gather(add.while), replica_groups={}, dimensions={0}
add.ag = f32[8,128] add(ag.1, get-tuple-element.36)
ROOT tuple = (s32[], f32[2,128], f32[8,128], f32[8,128]) tuple(add.230, get-tuple-element.395, get-tuple-element.35, ag.1)
}
ENTRY entry {
c0 = s32[] constant(0)
p0 = f32[2,128] parameter(0)
p1 = f32[8,128] parameter(1)
tuple = (s32[], f32[2,128], f32[8,128], f32[8,128]) tuple(c0, p0, p1, p1)
while = (s32[], f32[2,128], f32[8,128], f32[8,128]) while(tuple), condition=while_cond, body=while_body
gte1 = f32[2,128] get-tuple-element(while), index=1
gte2 = f32[8,128] get-tuple-element(while), index=3
ROOT tuple.result = (f32[2,128], f32[8,128]) tuple(gte1, gte2)
}
)";

const int64_t kNumReplicas = 1;
SKIP_TEST_IF_NUM_DEVICES_LESS_THAN(kNumReplicas);

HloModuleConfig config =
GetModuleConfigForTest(/*replica_count=*/kNumReplicas);
auto opts = GetDebugOptionsForTest();
opts.set_xla_gpu_run_post_layout_collective_pipeliner(true);
opts.set_xla_gpu_enable_pipelined_all_reduce(true);
opts.set_xla_gpu_enable_pipelined_all_gather(true);
opts.set_xla_gpu_enable_pipelined_reduce_scatter(true);

opts.set_xla_gpu_enable_triton_gemm(false);
config.set_debug_options(opts);
config.set_use_spmd_partitioning(false);

TF_ASSERT_OK_AND_ASSIGN(
auto module, ParseAndReturnVerifiedModule(kModuleReplicatedStr, config));

TF_ASSERT_OK_AND_ASSIGN(auto executable,
CreateExecutable(std::move(module),
/*run_hlo_passes=*/true));
EXPECT_TRUE(executable->has_module());
}

TEST_F(CollectiveOpsTestE2E, AllToAllQuantizeCollectiveQuantizer) {
absl::string_view kModuleReplicatedStr = R"(
HloModule pjit__unnamed_wrapped_function_, entry_computation_layout={()->bf16[2]}, num_partitions=2
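The deleted test above exercised the removed flag. Tests that cover pipelined collectives now configure only the remaining DebugOptions knobs; a sketch based on the deleted test body:

// Sketch: enabling pipelined collectives in an e2e test after this change.
HloModuleConfig config =
    GetModuleConfigForTest(/*replica_count=*/kNumReplicas);
auto opts = GetDebugOptionsForTest();
opts.set_xla_gpu_enable_pipelined_all_reduce(true);
opts.set_xla_gpu_enable_pipelined_all_gather(true);
opts.set_xla_gpu_enable_pipelined_reduce_scatter(true);
opts.set_xla_gpu_enable_triton_gemm(false);
config.set_debug_options(opts);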
3 changes: 2 additions & 1 deletion xla/xla.proto
@@ -662,7 +662,8 @@ message DebugOptions {
bool xla_gpu_enable_pipelined_all_gather = 227;
bool xla_gpu_enable_pipelined_reduce_scatter = 231;
bool xla_gpu_enable_pipelined_p2p = 246;
bool xla_gpu_run_post_layout_collective_pipeliner = 313;

reserved 313; // Was xla_gpu_run_post_layout_collective_pipeliner.

// The minimum data size in bytes to trigger collective-permute-decomposer
// transformation.
