Skip to content

Commit

Permalink
Allow skipping doneEvent inside MSCCL. (#753)
Browse files Browse the repository at this point in the history
* Skip doneEvent inside MSCCL by default.

Added an RCCL_MSCCL_ENABLE_DONE_EVENT env var, set to 0 by default.

The env var controls whether doneEvent is used when invoking MSCCL
kernels.

Skipping doneEvent would cause the firmware to skip L2 cache flush,
resulting in overall performance improvement.

(cherry picked from commit 12dba42)

* Address review feedback and make the flag disabled by default.

(cherry picked from commit ca4a1df)

---------

Co-authored-by: Wen-Heng (Jack) Chung <whchung@gmail.com>
  • Loading branch information
wenkaidu and whchung authored May 31, 2023
1 parent 36e453c commit f7c1a0e
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions src/misc/msccl/msccl_setup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#include "msccl/msccl_setup.h"
#include "msccl/msccl_status.h"

// Runtime switch for using comm->doneEvent around MSCCL kernel launches; it is
// read through rcclParamMscclEnableDoneEvent() in mscclSetupKernel.
// Declared only when HIP_EVENT_DISABLE_FENCE is not defined; when that macro is
// defined, mscclSetupKernel always uses the done event.
// NOTE(review): the trailing 1 is presumably the default value (done event
// enabled) -- confirm against the RCCL_PARAM definition.
#ifndef HIP_EVENT_DISABLE_FENCE
RCCL_PARAM(MscclEnableDoneEvent, "MSCCL_ENABLE_DONE_EVENT", 1);
#endif

ncclResult_t mscclSetupCount(struct mscclAlgo* hostAlgo, ncclComm_t comm, size_t count, ncclDataType_t dataType) {
mscclStatus& status = mscclGetStatus();
status.stepSize = comm->buffSizes[hostAlgo->protocol] / NCCL_STEPS;
Expand Down Expand Up @@ -260,7 +264,14 @@ ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count
ncclComm_t comm, hipStream_t stream) {
mscclStatus& status = mscclGetStatus();

if (status.lastStream != stream && status.lastStream != nullptr) {
bool enableDoneEvent =
#ifndef HIP_EVENT_DISABLE_FENCE
(rcclParamMscclEnableDoneEvent() == 1);
#else
true;
#endif

if (enableDoneEvent && (status.lastStream != stream && status.lastStream != nullptr)) {
CUDACHECK(hipStreamWaitEvent(stream, comm->doneEvent, 0));
}

Expand All @@ -284,7 +295,11 @@ ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count

void *args[3] = {&comm->devComm, &devAlgo, &work};
void *func = mscclKernelEntries[(opFull.op * ncclNumTypes + dataType) * NCCL_NUM_PROTOCOLS + hostAlgo->protocol];
CUDACHECK(hipExtLaunchKernel(func, grid, block, args, 0, stream, NULL, comm->doneEvent, 0));
if (enableDoneEvent) {
CUDACHECK(hipExtLaunchKernel(func, grid, block, args, 0, stream, NULL, comm->doneEvent, 0));
} else {
CUDACHECK(hipExtLaunchKernel(func, grid, block, args, 0, stream, NULL, NULL, 0));
}
status.workIndex++;
status.lastStream = stream;
return ncclSuccess;
Expand Down

0 comments on commit f7c1a0e

Please sign in to comment.