Skip to content

Commit

Permalink
Merge pull request #214 from wenkaidu/gdr
Browse files Browse the repository at this point in the history
Cache the GDR support detection result so that detection runs only once per device
  • Loading branch information
wenkaidu authored May 22, 2020
2 parents 957be85 + 67c8e72 commit 2a45147
Showing 1 changed file with 11 additions and 0 deletions.
11 changes: 11 additions & 0 deletions src/include/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->cl
/*
 * Writes the GDR support result for the current device to *gdrSupport.
 *
 * The result is remembered per device index (as returned by hipGetDevice) in a
 * function-local static table, so later calls for the same device return the
 * cached value without repeating the probe. *gdrSupport is set to 0 before the
 * probe loop; the statements that set it to a non-zero value are not visible
 * in this excerpt (parts of the loop body are elided by the diff view).
 *
 * Returns ncclSuccess on the paths visible here. NCCLCHECK / CUDACHECK are
 * defined elsewhere; presumably they return early with an error code when the
 * wrapped call fails -- TODO confirm.
 */
static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
int netDevs;
NCCLCHECK(ncclNetDevices(&netDevs));
// NOTE(review): this mutex is a non-static local, so every call creates and
// locks its own private mutex. As written it cannot serialize concurrent
// callers; it probably needs to be `static` -- confirm intent.
pthread_mutex_t ncclParamMutexGpuGdrSupport = PTHREAD_MUTEX_INITIALIZER;
// Per-device cache of the probe result; -1 means "not probed yet".
// NOTE(review): the table has 16 entries and is indexed by cudaDev below
// without a bounds check -- verify that device indices can never reach 16.
static int gdrSupportCached[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
int cudaDev;
CUDACHECK(hipGetDevice(&cudaDev));
// Fast path: reuse a previously stored result for this device.
// NOTE(review): this read happens without holding the mutex.
if (gdrSupportCached[cudaDev] != -1) {
*gdrSupport = gdrSupportCached[cudaDev];
return ncclSuccess;
}
// Default result if the loop below does not change it.
*gdrSupport = 0;
for (int dev=0; dev<netDevs; dev++) {
// Find a net device which is GDR-capable
Expand All @@ -51,6 +59,7 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
ncclNetHandle_t handle;
void* gpuPtr = NULL;
void* mHandle = NULL;
// The listen/connect/accept probe and its teardown run between this lock and
// the matching unlock below.
// NOTE(review): if NCCLCHECK returns early on failure (macro not visible
// here -- confirm), any failing call in this region leaves the mutex locked.
pthread_mutex_lock(&ncclParamMutexGpuGdrSupport);
NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
NCCLCHECK(ncclNetAccept(lComm, &rComm));
Expand All @@ -67,8 +76,10 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
NCCLCHECK(ncclNetCloseRecv(rComm));
NCCLCHECK(ncclNetCloseSend(sComm));
NCCLCHECK(ncclNetCloseListen(lComm));
pthread_mutex_unlock(&ncclParamMutexGpuGdrSupport);
// Stop after the first net device that reaches this point.
break;
}
// Store the result so later calls for this device take the fast path above.
// NOTE(review): this write happens outside the locked region.
gdrSupportCached[cudaDev] = *gdrSupport;
return ncclSuccess;
}

Expand Down

0 comments on commit 2a45147

Please sign in to comment.