From b826c216801537efff232b784472b4982559f913 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 16 Oct 2024 14:23:34 -0700 Subject: [PATCH] minor updates --- apps/nccl/src/allreduce.hpp | 17 +++++++---------- include/mscclpp/packet_device.hpp | 2 -- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 53add798..228b8e6b 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -43,7 +43,6 @@ template <> __forceinline__ __device__ __half2 clip(__half2 val) { val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff)); val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff)); - val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff)); val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff)); return val; @@ -242,19 +241,19 @@ __global__ void __launch_bounds__(1024, 1) size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; - + if (sizeof(T) == 2) - nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); + nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); else - nelems = nelems / (sizeof(int) / sizeof(T)); - + nelems = nelems / (sizeof(int) / sizeof(T)); + const int nPeers = nRanksPerNode - 1; - const size_t nPkts = nelems/2; - + const size_t nPkts = nelems / 2; + int nelemsPerRank = nelems / worldSize; if ((nelemsPerRank % 2)) nelemsPerRank = (nelemsPerRank * sizeof(T) + sizeof(T)) / sizeof(T); - const int nPktsPerRank = nelemsPerRank/2; + const int nPktsPerRank = nelemsPerRank / 2; // thread block & channel info const int nBlocksPerPeer = gridDim.x / nPeers; const int localBlockIdx = blockIdx.x % nBlocksPerPeer; @@ -286,9 +285,7 @@ __global__ void __launch_bounds__(1024, 1) for (int index = 0; index < NPEERS; index++) { const int remoteRank = index < rank ? 
index : index + 1; mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; - //uint32_t val = dstPkt[idx].read(flag, -1); uint2 val = dstPkt[idx].read(flag); - //data = add_vectors(val, data); data.x = add_vectors(val.x, data.x); data.y = add_vectors(val.y, data.y); } diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index 532676d4..d7f2ee8a 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -49,8 +49,6 @@ union alignas(16) LL16Packet { #else // !defined(MSCCLPP_DEVICE_CUDA) uint4 reg = make_uint4(val1, flag, val2, flag); ulonglong2* p = reinterpret_cast<ulonglong2*>(&reg); - /*atomicStore(&(raw_.x), p->x, memoryOrderRelaxed); - atomicStore(&(raw_.y), p->y, memoryOrderRelaxed);*/ __builtin_nontemporal_store(p->x, &(raw_.x)); __builtin_nontemporal_store(p->y, &(raw_.y)); #endif