From 93a5cda770c028915db8efec274f10ac88570c53 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 8 Jul 2022 07:08:25 +0000 Subject: [PATCH 001/109] add all_reduce_group test --- src/Makefile | 2 +- src/all_reduce_group.cu | 126 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 src/all_reduce_group.cu diff --git a/src/Makefile b/src/Makefile index 2a399db..977aa02 100644 --- a/src/Makefile +++ b/src/Makefile @@ -75,7 +75,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube +BIN_FILES_LIST := all_reduce all_reduce_group all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/all_reduce_group.cu b/src/all_reduce_group.cu new file mode 100644 index 0000000..88e9b3d --- /dev/null +++ b/src/all_reduce_group.cu @@ -0,0 +1,126 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + ncclGroupStart(); + printf("ofccl_nccl_test group start\n"); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("ofccl_nccl_test 1st allreduce\n"); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("ofccl_nccl_test 2nd allreduce\n"); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("ofccl_nccl_test 3rd allreduce\n"); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("ofccl_nccl_test 4th allreduce\n"); + ncclGroupEnd(); + printf("ofccl_nccl_test group end\n"); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Wed, 13 Jul 2022 09:05:00 +0000 Subject: [PATCH 002/109] simple group allreduce --- src/Makefile | 2 +- src_simple/Makefile | 95 ++ {src => src_simple}/all_reduce_group.cu | 81 +- src_simple/common_simple.cu | 1222 +++++++++++++++++++++++ src_simple/common_simple.h | 275 +++++ src_simple/nccl1_compat.h | 50 + 6 files changed, 1687 insertions(+), 38 deletions(-) create mode 100644 src_simple/Makefile rename {src => src_simple}/all_reduce_group.cu (71%) create mode 100644 src_simple/common_simple.cu create mode 100644 src_simple/common_simple.h create mode 100644 src_simple/nccl1_compat.h diff --git a/src/Makefile b/src/Makefile index 977aa02..2a399db 100644 --- a/src/Makefile +++ b/src/Makefile @@ -75,7 +75,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_reduce_group all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather 
sendrecv hypercube BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/Makefile b/src_simple/Makefile new file mode 100644 index 0000000..35ba3bb --- /dev/null +++ b/src_simple/Makefile @@ -0,0 +1,95 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 0 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. +ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_80,code=sm_80 \ + -gencode=arch=compute_80,code=compute_80 +else +NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ + -gencode=arch=compute_50,code=sm_50 \ + -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_70,code=compute_70 +endif + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := all_reduce_group +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_simple.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_simple.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src/all_reduce_group.cu b/src_simple/all_reduce_group.cu similarity index 71% rename from src/all_reduce_group.cu rename to src_simple/all_reduce_group.cu index 88e9b3d..1d484d7 100644 --- a/src/all_reduce_group.cu +++ b/src_simple/all_reduce_group.cu @@ -5,8 +5,9 @@ ************************************************************************/ #include "cuda_runtime.h" -#include "common.h" +#include "common_simple.h" #include +#include void print_header() { PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); @@ -55,18 +56,20 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl } testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, 
ncclComm_t comm, cudaStream_t stream) { + static int round; ncclGroupStart(); - printf("ofccl_nccl_test group start\n"); + printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("ofccl_nccl_test 1st allreduce\n"); + printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("ofccl_nccl_test 2nd allreduce\n"); + printf("<%d> %d ofccl_nccl_test 2nd allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("ofccl_nccl_test 3rd allreduce\n"); + printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("ofccl_nccl_test 4th allreduce\n"); + printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); ncclGroupEnd(); - printf("ofccl_nccl_test group end\n"); + printf("<%d> %d ofccl_nccl_test group end\n", getpid(), round); + round++; return testSuccess; } @@ -85,36 +88,40 @@ void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, in testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { args->collTest = &allReduceTest; - ncclDataType_t *run_types; - ncclRedOp_t *run_ops; - const char **run_typenames, **run_opnames; - int type_count, op_count; - - if ((int)type != -1) { - type_count = 1; - run_types = &type; - run_typenames = &typeName; - } else { - type_count = test_typenum; - run_types = test_types; - run_typenames = test_typenames; - } - - if ((int)op != -1) { - op_count = 1; - run_ops = &op; - run_opnames = &opName; - } else { - op_count = test_opnum; - run_ops = test_ops; - run_opnames = test_opnames; - } - - for (int i=0; i %d ofccl_nccl_test invoke TimeTest\n", getpid(), test_round); + test_round++; + TESTCHECK(TimeTest(args, ncclFloat, "float", ncclSum, "sum", -1)); return testSuccess; } diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu new file mode 100644 index 0000000..d1e5622 --- /dev/null +++ b/src_simple/common_simple.cu @@ -0,0 +1,1222 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_simple.h" +#include +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { + case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template __device__ +float toFloat(T a) { + return (float)a; +} +template<> 
__device__
+float toFloat(half a) {
+  return __half2float(a);
+}
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<> __device__
+float toFloat(__nv_bfloat16 a) {
+  return __bfloat162float(a);
+}
+#endif
+
+template<typename T, int BSIZE> __global__
+void deltaKern(void* A_, void* B_, size_t count, double* max) {
+  const T* A = (const T*)A_;
+  const T* B = (const T*)B_;
+  __shared__ double temp[BSIZE];
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
+  double locmax = 0.0;
+  for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
+    double delta = absDiff(A[i], B[i]);
+    if( delta > locmax ) {
+      locmax = delta;
+#ifdef DEBUG_PRINT
+      if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i]));
+#endif
+    }
+  }
+
+  tid = threadIdx.x;
+  temp[tid] = locmax;
+  for(int stride = BSIZE/2; stride > 1; stride>>=1) {
+    __syncthreads();
+    if( tid < stride )
+      temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride];
+  }
+  __syncthreads();
+  if( threadIdx.x == 0)
+    max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
+}
+
+testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
+  switch (type) {
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+      deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+#endif
+    case ncclHalf:
+      deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+    case ncclFloat:
+      deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+    case ncclDouble:
+      deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+
+    case ncclChar:
+#if NCCL_MAJOR >= 2
+    case ncclUint8:
+#endif
+      deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+    case ncclInt:
+#if NCCL_MAJOR >= 2
+    case ncclUint32:
+#endif
+      deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+    case ncclInt64:
+    case ncclUint64:
+      deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+  }
+  CUDACHECK(cudaDeviceSynchronize());
+  for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
+  return testSuccess;
+}
+
+template<typename T>
+__device__ T testValue(const size_t offset, const int rep, const int rank) {
+  uint8_t v = (rep+rank+offset) % 256;
+  return (T)v;
+}
+
+// For floating point datatype, we use values between 0 and 1 otherwise the
+// Product operation will produce NaNs.
+template<>
+__device__ double testValue<double>(const size_t offset, const int rep, const int rank) {
+  return 1.0/(1.0+(double)testValue<int>(offset, rep, rank));
+}
+template<>
+__device__ float testValue<float>(const size_t offset, const int rep, const int rank) {
+  return 1.0/(1.0+(float)testValue<int>(offset, rep, rank));
+}
+template<>
+__device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
+  return __float2half(testValue<float>(offset, rep, rank));
+}
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) {
+  return __float2bfloat16(testValue<float>(offset, rep, rank));
+}
+#endif
+
+// Operations
+template<typename T>
+__device__ T ncclOpSum(T a, T b) { return a+b; }
+template<typename T>
+__device__ T ncclOpProd(T a, T b) { return a*b; }
+template<typename T>
+__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; }
+template<typename T>
+__device__ T ncclOpMin(T a, T b) { return a<b ? a : b; }
+template<>
+__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); }
+template<>
+__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); }
+template<>
+__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; }
+template<>
+__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
+
+template<typename T>
+__device__ T ncclPPOpIdent(T x, int arg) { return x; }
+template<typename T>
+__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); }
+template<typename T>
+__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); }
+template<>
+__device__ half ncclPPOpMul(half x, int arg) {
+  return __float2half(__half2float(x)*float(arg));
+}
+template<>
+__device__ half ncclPPOpDiv(half x, int n) {
+  return __float2half(__half2float(x)/n);
+}
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) {
+  return __float2bfloat16(__bfloat162float(x)*float(arg));
+}
+template<>
+__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) {
+  return __float2bfloat16(__bfloat162float(x)/n);
+}
+#endif
+
+__host__ __device__ int preMulScalar(int rank) {
+  return 1 + rank%2;
+}
+
+template<typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)>
+__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
+  for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=blockDim.x*gridDim.x) {
+    T val = testValue<T>(o+offset, rep, 0);
+    val = PreOp(val, preMulScalar(0));
+    for (int i=1; i<nranks; i++) {
+      T val1 = testValue<T>(o+offset, rep, i);
+      val1 = PreOp(val1, preMulScalar(i));
+      val = Op(val, val1);
+    }
+    data[o] = PostOp(val, nranks);
+  }
+}
+
+#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel<type, op<type>, preop<type>, postop<type> >
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \
+    KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent)
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv)
+#else
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent)
+#endif
+
+static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = {
+  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  OPS(__nv_bfloat16)
+#endif
+};
+
+testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
+  dim3 grid = { 32, 1, 1 };
+  dim3 block = { 256, 1, 1 };
+  void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks };
+  CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault));
+  return testSuccess;
+}
+
+template<typename T>
+__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) {
+  for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=blockDim.x*gridDim.x)
+    data[o] = testValue<T>(o, rep, rank);
+}
+
+static void* const initDataKerns[ncclNumTypes] = {
+  (void*)InitDataKernel<  int8_t>,
+  (void*)InitDataKernel< uint8_t>,
+  (void*)InitDataKernel< int32_t>,
+  (void*)InitDataKernel<uint32_t>,
+  (void*)InitDataKernel< int64_t>,
+  (void*)InitDataKernel<uint64_t>,
+  (void*)InitDataKernel<    half>,
+  
(void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // printf("startColl, args->nGpus > 1 run ncclGroupStart\n"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) { + // printf("startColl, args->nGpus > 1 run ncclGroupEnd\n"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } + +// // Sync +// TESTCHECK(startColl(args, type, op, root, in_place, 0)); +// TESTCHECK(completeColl(args)); + +// Barrier(args); + +// #if CUDART_VERSION >= 11030 +// cudaGraph_t graphs[args->nGpus]; +// cudaGraphExec_t graphExec[args->nGpus]; +// if (cudaGraphLaunches >= 1) { +// // Begin cuda graph capture +// for (int i=0; inGpus; i++) { +// // Thread local mode is needed for: +// // - Multi-thread mode +// // - P2P pre-connect +// CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); +// } +// } +// #endif + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if 
(agg_iters>1) NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // End cuda graph capture +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); +// } +// // Instantiate cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); +// } +// // Resync CPU, restart timing, launch cuda graph +// Barrier(args); +// start = std::chrono::high_resolution_clock::now(); +// for (int l=0; lnGpus; i++) { +// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); +// } +// } +// } +// #endif + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// //destroy cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); +// CUDACHECK(cudaGraphDestroy(graphs[i])); +// } +// } +// #endif + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // Begin cuda graph capture for data check +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); +// } +// } +// #endif + + //test validation in single itertion, should ideally be included into the multi-iteration run + // TESTCHECK(startColl(args, type, op, root, in_place, 0)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // End cuda graph capture +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); +// } +// // Instantiate cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); +// } +// // Launch cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); +// } +// } +// #endif + + // TESTCHECK(completeColl(args)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// //destroy cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); +// CUDACHECK(cudaGraphDestroy(graphs[i])); +// } +// } +// #endif + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // // Warm-up for large size + // setupArgs(args->maxbytes, type, args); + // for (int iter = 0; iter < warmup_iters; iter++) { + // TESTCHECK(startColl(args, type, op, root, 0, iter)); + // } + // TESTCHECK(completeColl(args)); + + // // Warm-up for small size + // setupArgs(args->minbytes, type, args); + // for (int iter = 0; iter < warmup_iters; iter++) { + // TESTCHECK(startColl(args, type, op, root, 0, iter)); + // } + // TESTCHECK(completeColl(args)); + + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. 
If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, 
+ {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if 
(parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + void* expected[nGpus*nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + threads[t].args.sendbuffs = sendbuffs+t*nGpus; + threads[t].args.recvbuffs = recvbuffs+t*nGpus; + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef 
testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif From 06c97daf8c8b4c3d1cb4d9952eadf9200ce0ed4e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 14 Jul 2022 12:10:11 +0000 Subject: [PATCH 003/109] nccl group bigger --- src_simple/all_reduce_group.cu | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src_simple/all_reduce_group.cu b/src_simple/all_reduce_group.cu index 1d484d7..9a702ec 100644 --- a/src_simple/all_reduce_group.cu +++ b/src_simple/all_reduce_group.cu @@ -59,6 +59,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, nccl static int round; ncclGroupStart(); printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); @@ -67,6 +68,15 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, nccl printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 5th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 6th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 7th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 8th allreduce\n", getpid(), round); + ncclGroupEnd(); printf("<%d> %d ofccl_nccl_test group end\n", getpid(), 
round); round++; From 560f9eb298164312b7c59036c8029dc1a85fb912 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 16 Jul 2022 04:18:43 +0000 Subject: [PATCH 004/109] log for ncclGroupStart/End --- src/nccl1_compat.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h index 020a4bc..32f04e6 100644 --- a/src/nccl1_compat.h +++ b/src/nccl1_compat.h @@ -3,7 +3,7 @@ * * See LICENSE.txt for license information ************************************************************************/ - +#include #ifndef NCCL1_COMPAT_H #define NCCL1_COMPAT_H @@ -14,8 +14,8 @@ #define ncclNumOps nccl_NUM_OPS #define ncclNumTypes nccl_NUM_TYPES -static ncclResult_t ncclGroupStart() { return ncclSuccess; } -static ncclResult_t ncclGroupEnd() { return ncclSuccess; } +static ncclResult_t ncclGroupStart() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } +static ncclResult_t ncclGroupEnd() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; From abe8f5c27ae9865cef91e305e0c5107bc158e757 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 16 Jul 2022 04:34:50 +0000 Subject: [PATCH 005/109] add non group all_reduce in simple --- src_simple/Makefile | 2 +- src_simple/all_reduce_simple.cu | 114 ++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 src_simple/all_reduce_simple.cu diff --git a/src_simple/Makefile b/src_simple/Makefile index 35ba3bb..86267b2 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -75,7 +75,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce_group +BIN_FILES_LIST := all_reduce_group all_reduce_simple BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/all_reduce_simple.cu b/src_simple/all_reduce_simple.cu new file mode 100644 index 0000000..bdeeb48 --- /dev/null +++ b/src_simple/all_reduce_simple.cu @@ -0,0 +1,114 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_simple.h" + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = 
test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Sat, 16 Jul 2022 04:39:37 +0000 Subject: [PATCH 006/109] half ofccl_all_reduce --- src_simple/Makefile | 2 +- src_simple/ofccl_all_reduce.cu | 143 +++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 src_simple/ofccl_all_reduce.cu diff --git a/src_simple/Makefile b/src_simple/Makefile index 35ba3bb..5e56588 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -75,7 +75,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce_group +BIN_FILES_LIST := all_reduce_group ofccl_all_reduce BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu new file mode 100644 index 0000000..9a702ec --- /dev/null +++ b/src_simple/ofccl_all_reduce.cu @@ -0,0 +1,143 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + static int round; + ncclGroupStart(); + printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); + + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 2nd allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 5th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 6th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 7th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 8th allreduce\n", getpid(), round); + + ncclGroupEnd(); + printf("<%d> %d ofccl_nccl_test group end\n", getpid(), round); + round++; + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + // ncclDataType_t *run_types; + // ncclRedOp_t *run_ops; + // const char **run_typenames, **run_opnames; + // int type_count, op_count; + + // if ((int)type != -1) { + // type_count = 1; + // run_types = &type; + // run_typenames = &typeName; + // } else { + // type_count = test_typenum; + // run_types = test_types; + // run_typenames = test_typenames; + // } + + // if ((int)op != -1) { + // op_count = 1; + // run_ops = &op; + // run_opnames = &opName; + // } else { + // op_count = test_opnum; + // run_ops = test_ops; + // run_opnames = test_opnames; + // } + + // for (int i=0; i %d ofccl_nccl_test invoke TimeTest\n", getpid(), test_round); + test_round++; + TESTCHECK(TimeTest(args, ncclFloat, "float", ncclSum, "sum", -1)); + return testSuccess; +} + +struct testEngine allReduceEngine = { + AllReduceGetBuffSize, + 
AllReduceRunTest +}; + +#pragma weak ncclTestEngine=allReduceEngine From b8a749a96776893c5c503780233163ac2058abd3 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 16 Jul 2022 09:15:44 +0000 Subject: [PATCH 007/109] Simple not really necessary, yet no harm to keep it --- src_simple/common_simple.cu | 137 +--- src_simple/common_simple.cu.pure | 1216 +++++++++++++++++++++++++++ src_simple/common_simple.cu.simple | 1222 ++++++++++++++++++++++++++++ 3 files changed, 2460 insertions(+), 115 deletions(-) create mode 100644 src_simple/common_simple.cu.pure create mode 100644 src_simple/common_simple.cu.simple diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index d1e5622..0d88bb3 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -518,7 +518,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t size_t shift = totalnbytes * (iter % steps); if (args->nGpus > 1) { - // printf("startColl, args->nGpus > 1 run ncclGroupStart\n"); + printf("\nstartColl, args->nGpus > 1 run ncclGroupStart\n"); NCCLCHECK(ncclGroupStart()); } for (int i = 0; i < args->nGpus; i++) { @@ -575,7 +575,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #endif } if (args->nGpus > 1) { - // printf("startColl, args->nGpus > 1 run ncclGroupEnd\n"); + printf("\nstartColl, args->nGpus > 1 run ncclGroupEnd\n"); NCCLCHECK(ncclGroupEnd()); } @@ -601,25 +601,25 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); } -// // Sync -// TESTCHECK(startColl(args, type, op, root, in_place, 0)); -// TESTCHECK(completeColl(args)); - -// Barrier(args); - -// #if CUDART_VERSION >= 11030 -// cudaGraph_t graphs[args->nGpus]; -// cudaGraphExec_t graphExec[args->nGpus]; -// if (cudaGraphLaunches >= 1) { -// // Begin cuda graph capture -// for (int i=0; inGpus; i++) { -// // Thread local mode is needed for: -// // - Multi-thread mode -// // - P2P pre-connect -// CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); -// } -// } -// #endif + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + Barrier(args); + +#if CUDART_VERSION >= 11030 + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + // Thread local mode is needed for: + // - Multi-thread mode + // - P2P pre-connect + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } +#endif // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); @@ -631,27 +631,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // End cuda graph capture -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); -// } -// // Instantiate cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); -// } -// // Resync CPU, restart timing, launch cuda graph -// Barrier(args); -// start = std::chrono::high_resolution_clock::now(); -// for (int l=0; lnGpus; i++) { -// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); -// } -// } -// } -// #endif - TESTCHECK(completeColl(args)); auto delta = 
std::chrono::high_resolution_clock::now() - start; @@ -660,15 +639,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; Allreduce(args, &deltaSec, average); -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// //destroy cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); -// CUDACHECK(cudaGraphDestroy(graphs[i])); -// } -// } -// #endif double algBw, busBw; args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); @@ -678,56 +648,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t double maxDelta = 0; static __thread int rep = 0; rep++; - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // Begin cuda graph capture for data check -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); -// } -// } -// #endif - - //test validation in single itertion, should ideally be included into the multi-iteration run - // TESTCHECK(startColl(args, type, op, root, in_place, 0)); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // End cuda graph capture -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); -// } -// // Instantiate cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); -// } -// // Launch cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); -// } -// } -// #endif - - // TESTCHECK(completeColl(args)); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// //destroy cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); -// CUDACHECK(cudaGraphDestroy(graphs[i])); -// } -// } -// #endif - - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); - - //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); - } + double timeUsec = deltaSec*1.0E6; char timeStr[100]; @@ -764,26 +685,12 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { } testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { - // // Warm-up for large size - // setupArgs(args->maxbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // TESTCHECK(startColl(args, type, op, root, 0, iter)); - // } - // TESTCHECK(completeColl(args)); - - // // Warm-up for small size - // setupArgs(args->minbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // TESTCHECK(startColl(args, type, op, root, 0, iter)); - // } - // TESTCHECK(completeColl(args)); // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? 
size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); TESTCHECK(BenchTime(args, type, op, root, 0)); - // TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } return testSuccess; diff --git a/src_simple/common_simple.cu.pure b/src_simple/common_simple.cu.pure new file mode 100644 index 0000000..c25c0e3 --- /dev/null +++ b/src_simple/common_simple.cu.pure @@ -0,0 +1,1216 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_simple.h" +#include +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { 
+ case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template __device__ +float toFloat(T a) { + return (float)a; +} +template<> __device__ +float toFloat(half a) { + return __half2float(a); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> __device__ +float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template __global__ +void deltaKern(void* A_, void* B_, size_t count, double* max) { + const T* A = (const T*)A_; + const T* B = (const T*)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x*blockDim.x + threadIdx.x; + double locmax = 0.0; + for(size_t i=tid; i locmax ) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for(int stride = BSIZE/2; stride > 1; stride>>=1) { + __syncthreads(); + if( tid < stride ) + temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + } + __syncthreads(); + if( threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep+rank+offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. 
+template<> +__device__ double testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(double)testValue(offset, rep, rank)); +} +template<> +__device__ float testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(float)testValue(offset, rep, rank)); +} +template<> +__device__ half testValue(const size_t offset, const int rep, const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template +__device__ T ncclOpSum(T a, T b) { return a+b; } +template +__device__ T ncclOpProd(T a, T b) { return a*b; } +template +__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } +template +__device__ T ncclOpMin(T a, T b) { return a +__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } +template<> +__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } +template<> +__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } +template<> +__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; } + +template +__device__ T ncclPPOpIdent(T x, int arg) { return x; } +template +__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } +template +__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } +template<> +__device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x)*float(arg)); +} +template<> +__device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x)/n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x)*float(arg)); +} +template<> +__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x)/n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { + return 1 + rank%2; +} + +template +__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i=1; i(o+offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, 
ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); +} + +static void* const initDataKerns[ncclNumTypes] = { + (void*)InitDataKernel< int8_t>, + (void*)InitDataKernel< uint8_t>, + (void*)InitDataKernel< int32_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< int64_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< half>, + (void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? 
MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } + + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + Barrier(args); + +#if CUDART_VERSION >= 11030 + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + // Thread local mode is needed for: + // - Multi-thread mode + // - P2P pre-connect + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } +#endif + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + } + if 
(agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Resync CPU, restart timing, launch cuda graph + Barrier(args); + start = std::chrono::high_resolution_clock::now(); + for (int l=0; lnGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } + } +#endif + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture for data check + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); + } + } +#endif + + //test validation in single itertion, should ideally be included into the multi-iteration run + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Launch cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } +#endif + + TESTCHECK(completeColl(args)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; 
+ + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // Warm-up for large size + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Warm-up for small size + setupArgs(args->minbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 
'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + 
void* expected[nGpus*nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + threads[t].args.sendbuffs = sendbuffs+t*nGpus; + threads[t].args.recvbuffs = recvbuffs+t*nGpus; + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t 
stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { + case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template __device__ +float toFloat(T a) { + return (float)a; +} +template<> __device__ +float toFloat(half a) { + return __half2float(a); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> __device__ +float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template __global__ +void deltaKern(void* A_, void* B_, size_t count, double* max) { + const T* A = (const T*)A_; + const T* B = (const T*)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x*blockDim.x + threadIdx.x; + double locmax = 0.0; + for(size_t i=tid; i locmax ) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for(int stride = BSIZE/2; stride > 1; stride>>=1) { + __syncthreads(); + if( tid < stride ) + temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + } + __syncthreads(); + if( threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? 
temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep+rank+offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template<> +__device__ double testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(double)testValue(offset, rep, rank)); +} +template<> +__device__ float testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(float)testValue(offset, rep, rank)); +} +template<> +__device__ half testValue(const size_t offset, const int rep, const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template +__device__ T ncclOpSum(T a, T b) { return a+b; } +template +__device__ T ncclOpProd(T a, T b) { return a*b; } +template +__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } +template +__device__ T ncclOpMin(T a, T b) { return a +__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } +template<> +__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } +template<> +__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } +template<> +__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } + +template +__device__ T ncclPPOpIdent(T x, int arg) { return x; } +template +__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } +template +__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } +template<> +__device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x)*float(arg)); +} +template<> +__device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x)/n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x)*float(arg)); +} +template<> +__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x)/n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { + return 1 + rank%2; +} + +template +__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i=1; i(o+offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); +} + +static void* const initDataKerns[ncclNumTypes] = { + (void*)InitDataKernel< int8_t>, + (void*)InitDataKernel< uint8_t>, + (void*)InitDataKernel< int32_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< int64_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< half>, + 
(void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // printf("startColl, args->nGpus > 1 run ncclGroupStart\n"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) { + // printf("startColl, args->nGpus > 1 run ncclGroupEnd\n"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } + + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + Barrier(args); + +#if CUDART_VERSION >= 11030 + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + // Thread local mode is needed for: + // - Multi-thread mode + // - P2P pre-connect + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } +#endif + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); 
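// Annotation (not part of the original patch): when agg_iters > 1, the aiter
// launches below are wrapped in a single ncclGroupStart()/ncclGroupEnd() pair,
// so each timed iteration submits agg_iters collectives as one aggregated
// group; the elapsed time is later divided by iters*agg_iters to report a
// per-operation figure.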
+ for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // End cuda graph capture +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); +// } +// // Instantiate cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); +// } +// // Resync CPU, restart timing, launch cuda graph +// Barrier(args); +// start = std::chrono::high_resolution_clock::now(); +// for (int l=0; lnGpus; i++) { +// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); +// } +// } +// } +// #endif + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// //destroy cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); +// CUDACHECK(cudaGraphDestroy(graphs[i])); +// } +// } +// #endif + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // Begin cuda graph capture for data check +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); +// } +// } +// #endif + + //test validation in single itertion, should ideally be included into the multi-iteration run + // TESTCHECK(startColl(args, type, op, root, in_place, 0)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // End cuda graph capture +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); +// } +// // Instantiate cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); +// } +// // Launch cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); +// } +// } +// #endif + + // TESTCHECK(completeColl(args)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// //destroy cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); +// CUDACHECK(cudaGraphDestroy(graphs[i])); +// } +// } +// #endif + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // // Warm-up for large size + // setupArgs(args->maxbytes, type, args); + // for (int iter = 0; iter < warmup_iters; iter++) { + // TESTCHECK(startColl(args, type, op, root, 0, iter)); + // } + // TESTCHECK(completeColl(args)); + + // // Warm-up for small size + // setupArgs(args->minbytes, type, args); + // for (int iter = 0; iter < warmup_iters; iter++) { + // TESTCHECK(startColl(args, type, op, root, 0, iter)); + // } + // TESTCHECK(completeColl(args)); + + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. 
If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, 
+ {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if 
(parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + void* expected[nGpus*nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + threads[t].args.sendbuffs = sendbuffs+t*nGpus; + threads[t].args.recvbuffs = recvbuffs+t*nGpus; + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i Date: Mon, 18 Jul 2022 02:45:09 +0000 Subject: [PATCH 008/109] ofccl test file --- .gitignore | 2 ++ src_simple/ofccl_all_reduce.cu | 27 +++------------------------ 2 files changed, 5 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index a0a013e..c908b05 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ # # See LICENCE.txt for license information /build + +.clangd \ No newline at end of file diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 9a702ec..9d3ad0c 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -56,30 +56,9 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl } testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - static int round; - ncclGroupStart(); - printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); - - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 2nd allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 5th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 6th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 7th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 8th allreduce\n", getpid(), round); - - ncclGroupEnd(); - printf("<%d> %d ofccl_nccl_test group end\n", getpid(), round); - round++; + + NCCLCHECK(ofcclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> ofccl_nccl_test invoke ofcclAllReduce\n", getpid()); return testSuccess; } From 8eba16feff90fa7aa6a2e97965539c738797722f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 18 Jul 2022 08:54:42 +0000 Subject: [PATCH 009/109] run startColl exactly as we want --- src_simple/common_simple.cu | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 0d88bb3..ba44d36 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -601,26 +601,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t 
TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); } - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); - Barrier(args); -#if CUDART_VERSION >= 11030 - cudaGraph_t graphs[args->nGpus]; - cudaGraphExec_t graphExec[args->nGpus]; - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture - for (int i=0; inGpus; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect - CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); - } - } -#endif - // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { From d6a4d47eddb59d905c4e68928dc0cd2d570ea305 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 18 Jul 2022 10:54:47 +0000 Subject: [PATCH 010/109] ofccl_all_reduce.cu --- src_simple/ofccl_all_reduce.cu | 64 ++++++++++++++++------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 9d3ad0c..62f8b69 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -77,40 +77,36 @@ void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, in testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { args->collTest = &allReduceTest; - // ncclDataType_t *run_types; - // ncclRedOp_t *run_ops; - // const char **run_typenames, **run_opnames; - // int type_count, op_count; - - // if ((int)type != -1) { - // type_count = 1; - // run_types = &type; - // run_typenames = &typeName; - // } else { - // type_count = test_typenum; - // run_types = test_types; - // run_typenames = test_typenames; - // } - - // if ((int)op != -1) { - // op_count = 1; - // run_ops = &op; - // run_opnames = &opName; - // } else { - // op_count = test_opnum; - // run_ops = test_ops; - // run_opnames = test_opnames; - // } - - // for (int i=0; i %d ofccl_nccl_test invoke TimeTest\n", getpid(), test_round); - test_round++; - TESTCHECK(TimeTest(args, ncclFloat, "float", ncclSum, "sum", -1)); + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Tue, 19 Jul 2022 15:25:50 +0000 Subject: [PATCH 011/109] add log --- src_simple/common_simple.cu | 5 +++++ src_simple/common_simple.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index ba44d36..e8fc1a6 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -679,6 +679,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* } testResult_t threadRunTests(struct threadArgs* args) { + // OFTEST_LOG1(TEST, "Enter threadRunTests"); // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. 
@@ -689,6 +690,7 @@ testResult_t threadRunTests(struct threadArgs* args) { } testResult_t threadInit(struct threadArgs* args) { + // OFTEST_LOG1(TEST, "Enter threadInit"); char hostname[1024]; getHostName(hostname, 1024); int nranks = args->nProcs*args->nThreads*args->nGpus; @@ -701,6 +703,7 @@ testResult_t threadInit(struct threadArgs* args) { int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); @@ -992,11 +995,13 @@ testResult_t run() { if (nProcs == 1) { int gpuArray[nGpus*nThreads]; for (int i=0; i #include "nccl1_compat.h" +#define OFTEST_LOG(PRE, FMT, args...) printf("\nTEST [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("\nTEST [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("\nTEST [%s:%d] <%s> " #PRE, __FILE__, __LINE__, __func__) + #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ From d3de0211576a5e9002908f25ce67f6abdcf0424f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 19 Jul 2022 18:07:50 +0000 Subject: [PATCH 012/109] add -M option: use seprate ncclComm for different coll op, even with the same dev set --- README.md | 1 + src_simple/common_simple.cu | 1459 ++++++++++++++++------------ src_simple/common_simple.cu.simple | 40 +- 3 files changed, 853 insertions(+), 647 deletions(-) diff --git a/README.md b/README.md index bff6433..1c3c505 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ All tests support the same set of arguments : * `-n,--iters ` number of iterations. Default : 20. * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. + * `-M,--multi_iters ` number of operations with seprate ncclComm in each iteration. Default : 1. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. 
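For reference, a minimal sketch of what the new `-M,--multi_iters` option is intended to enable: each timed iteration issues multi_iters collectives, and each of them runs on its own ncclComm_t built over the same set of devices. The communicator layout used below (multi_iters * nGpus comms, indexed as miter*nGpus + g) and the helper name runMultiIters are illustrative assumptions, not code taken from this patch.

    #include <nccl.h>

    // Sketch only: drive multi_iters collectives per iteration, each on a
    // separate communicator spanning the same GPUs. Error checking (NCCLCHECK)
    // is omitted for brevity; comms is assumed to hold multi_iters * nGpus
    // communicators created over the same device set.
    static void runMultiIters(int iters, int multi_iters, int nGpus, size_t count,
                              void** sendbuffs, void** recvbuffs,
                              ncclComm_t* comms, cudaStream_t* streams) {
      for (int iter = 0; iter < iters; iter++) {
        for (int miter = 0; miter < multi_iters; miter++) {
          ncclGroupStart();
          for (int g = 0; g < nGpus; g++) {
            ncclAllReduce(sendbuffs[g], recvbuffs[g], count, ncclFloat, ncclSum,
                          comms[miter * nGpus + g], streams[g]);
          }
          ncclGroupEnd();
        }
      }
    }

An invocation would then look something like `./ofccl_all_reduce_perf -g 4 -M 4` (binary name assumed), giving four independent communicators over the same devices in every iteration.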
diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index e8fc1a6..bb64ebc 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -5,46 +5,74 @@ ************************************************************************/ #include "common_simple.h" -#include +#include "cuda.h" +#include "nccl.h" #include +#include #include #include -#include "cuda.h" +#include int test_ncclVersion = 0; // init'd with ncclGetVersion() #if NCCL_MAJOR >= 2 - ncclDataType_t test_types[ncclNumTypes] = { - ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclBfloat16 - #endif - }; - const char *test_typenames[ncclNumTypes] = { - "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , "bfloat16" - #endif - }; - int test_typenum = -1; - - const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclAvg - #endif - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand - #endif - }; - int test_opnum = -1; +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; #else - ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; - const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; - int test_typenum = 7; - const char *test_opnames[] = {"sum", "prod", "max", "min"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; - int test_opnum = 4; +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; #endif thread_local int is_main_thread = 0; @@ -52,14 +80,15 @@ thread_local int is_main_thread = 0; // Command line parameter defaults static int nThreads = 1; static int nGpus = 1; -static size_t minBytes = 32*1024*1024; -static size_t maxBytes = 32*1024*1024; -static size_t stepBytes = 1*1024*1024; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static 
size_t stepBytes = 1 * 1024 * 1024; static size_t stepFactor = 1; static int datacheck = 1; static int warmup_iters = 5; static int iters = 20; static int agg_iters = 1; +static int multi_iters = 1; static int ncclop = ncclSum; static int nccltype = ncclFloat; static int ncclroot = 0; @@ -72,234 +101,251 @@ static int average = 1; #define NUM_BLOCKS 32 static double parsesize(const char *value) { - long long int units; - double size; - char size_lit; - - int count = sscanf(value, "%lf %1s", &size, &size_lit); - - switch (count) { - case 2: - switch (size_lit) { - case 'G': - case 'g': - units = 1024*1024*1024; - break; - case 'M': - case 'm': - units = 1024*1024; - break; - case 'K': - case 'k': - units = 1024; - break; - default: - return -1.0; - }; + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; break; - case 1: - units = 1; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; break; default: return -1.0; - } + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } - return size * units; + return size * units; } double DeltaMaxValue(ncclDataType_t type) { - switch(type) { - case ncclHalf: return 1e-2; + switch (type) { + case ncclHalf: + return 1e-2; #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: return 1e-2; + case ncclBfloat16: + return 1e-2; #endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - case ncclInt: + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: #if NCCL_MAJOR >= 2 - case ncclUint8: - //case ncclInt32: - case ncclUint32: + case ncclUint8: + // case ncclInt32: + case ncclUint32: #endif - case ncclInt64: - case ncclUint64: return 1e-200; + case ncclInt64: + case ncclUint64: + return 1e-200; } return 1e-200; } -template __device__ -double absDiff(T a, T b) { +template __device__ double absDiff(T a, T b) { return fabs((double)(b - a)); } -template<> __device__ -double absDiff(half a, half b) { +template <> __device__ double absDiff(half a, half b) { float x = __half2float(a); float y = __half2float(b); - return fabs((double)(y-x)); + return fabs((double)(y - x)); } -template __device__ -float toFloat(T a) { - return (float)a; -} -template<> __device__ -float toFloat(half a) { - return __half2float(a); -} +template __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } #if defined(__CUDA_BF16_TYPES_EXIST__) -template<> __device__ -float toFloat(__nv_bfloat16 a) { +template <> __device__ float toFloat(__nv_bfloat16 a) { return __bfloat162float(a); } #endif -template __global__ -void deltaKern(void* A_, void* B_, size_t count, double* max) { - const T* A = (const T*)A_; - const T* B = (const T*)B_; +template +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; __shared__ double temp[BSIZE]; - int tid = blockIdx.x*blockDim.x + threadIdx.x; + int tid = blockIdx.x * blockDim.x + threadIdx.x; double locmax = 0.0; - for(size_t i=tid; i locmax ) { + if (delta > locmax) { locmax = delta; #ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); #endif } } tid = 
threadIdx.x; temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; } __syncthreads(); - if( threadIdx.x == 0) + if (threadIdx.x == 0) max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; } -testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { +testResult_t CheckDelta(void *results, void *expected, size_t count, + ncclDataType_t type, double *devmax) { switch (type) { #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512> + <<>>(results, expected, count, devmax); + break; #endif - case ncclHalf: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclFloat: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclDouble: - deltaKern<<>>(results, expected, count, devmax); break; - - case ncclChar: + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); + break; + case ncclFloat: + deltaKern + <<>>(results, expected, count, devmax); + break; + case ncclDouble: + deltaKern + <<>>(results, expected, count, devmax); + break; + + case ncclChar: #if NCCL_MAJOR >= 2 - case ncclUint8: + case ncclUint8: #endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt: + deltaKern + <<>>(results, expected, count, devmax); + break; + case ncclInt: #if NCCL_MAJOR >= 2 - case ncclUint32: + case ncclUint32: #endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt64: - case ncclUint64: - deltaKern<<>>(results, expected, count, devmax); break; + deltaKern + <<>>(results, expected, count, devmax); + break; + case ncclInt64: + case ncclUint64: + deltaKern + <<>>(results, expected, count, devmax); + break; } CUDACHECK(cudaDeviceSynchronize()); - for (int i=1; i +template __device__ T testValue(const size_t offset, const int rep, const int rank) { - uint8_t v = (rep+rank+offset) % 256; + uint8_t v = (rep + rank + offset) % 256; return (T)v; } // For floating point datatype, we use values between 0 and 1 otherwise the // Product operation will produce NaNs. 
-template<> -__device__ double testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(double)testValue(offset, rep, rank)); +template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue(offset, rep, rank)); } -template<> -__device__ float testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(float)testValue(offset, rep, rank)); +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); } -template<> -__device__ half testValue(const size_t offset, const int rep, const int rank) { +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { return __float2half(testValue(offset, rep, rank)); } #if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { return __float2bfloat16(testValue(offset, rep, rank)); } #endif // Operations -template -__device__ T ncclOpSum(T a, T b) { return a+b; } -template -__device__ T ncclOpProd(T a, T b) { return a*b; } -template -__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } -template -__device__ T ncclOpMin(T a, T b) { return a __device__ T ncclOpSum(T a, T b) { return a + b; } +template __device__ T ncclOpProd(T a, T b) { return a * b; } +template __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template __device__ T ncclOpMin(T a, T b) { return a < b ? a : b; } // Definitions for half -template<> -__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } -template<> -__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } -template<> -__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } -template<> -__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; } - -template -__device__ T ncclPPOpIdent(T x, int arg) { return x; } -template -__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } -template -__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } -template<> -__device__ half ncclPPOpMul(half x, int arg) { - return __float2half(__half2float(x)*float(arg)); +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? 
a : b; +} + +template __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); } -template<> -__device__ half ncclPPOpDiv(half x, int n) { - return __float2half(__half2float(x)/n); +template __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); } #if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { - return __float2bfloat16(__bfloat162float(x)*float(arg)); +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); } -template<> -__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { - return __float2bfloat16(__bfloat162float(x)/n); +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); } #endif -__host__ __device__ int preMulScalar(int rank) { - return 1 + rank%2; -} +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } -template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); +template +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue(o + offset, rep, 0); val = PreOp(val, preMulScalar(0)); - for (int i=1; i(o+offset, rep, i); + for (int i = 1; i < nranks; i++) { + T val1 = testValue(o + offset, rep, i); val1 = PreOp(val1, preMulScalar(i)); val = Op(val, val1); } @@ -307,212 +353,243 @@ __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offse } } -#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ - KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel, preop, postop> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, 
ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) #else - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) #endif -static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - OPS(__nv_bfloat16) +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) #endif }; -testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count, (void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); return testSuccess; } -template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) data[o] = testValue(o, rep, rank); } -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - (void*)InitDataKernel< float>, - (void*)InitDataKernel< double>, -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - (void*)InitDataKernel<__nv_bfloat16> +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> #endif }; -template -testResult_t 
InitDataType(void* dest, const size_t N, const int rep, const int rank) { - T* ptr = (T*)dest; +template +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); return testSuccess; } -testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, + cudaStreamDefault)); return testSuccess; } -void Barrier(struct threadArgs* args) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { + if (args->thread + 1 == args->nThreads) { #ifdef MPI_SUPPORT MPI_Barrier(MPI_COMM_WORLD); #endif args->barrier[args->barrier_idx] = 0; } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); + while (args->barrier[args->barrier_idx]) + pthread_yield(); } - args->barrier_idx=!args->barrier_idx; + args->barrier_idx = !args->barrier_idx; } // Inter-thread/process barrier+allreduce -void Allreduce(struct threadArgs* args, double* value, int average) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); double val = *value; if (args->thread > 0) { double val2 = args->reduce[args->barrier_idx]; - if (average == 1) val += val2; - if (average == 2) val = std::min(val, val2); - if (average == 3) val = std::max(val, val2); + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); } - if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { + if (args->thread + 1 == args->nThreads) { #ifdef MPI_SUPPORT if (average != 0) { MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? 
MPI_MIN : MPI_MAX; - MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); } #endif - if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; - args->reduce[1-args->barrier_idx] = 0; + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; args->barrier[args->barrier_idx] = 0; } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); + while (args->barrier[args->barrier_idx]) + pthread_yield(); } *value = args->reduce[args->barrier_idx]; - args->barrier_idx=!args->barrier_idx; + args->barrier_idx = !args->barrier_idx; } -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { - size_t count = args->expectedBytes/wordSize(type); +testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes / wordSize(type); double maxDelta = 0.0; - for (int i=0; inGpus; i++) { + for (int i = 0; i < args->nGpus; i++) { int device; - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); CUDACHECK(cudaSetDevice(device)); - void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + + args->recvInplaceOffset * rank)) + : args->recvbuffs[i]; + TESTCHECK( + CheckDelta(data, args->expected[i], count, type, args->deltaHost)); maxDelta = std::max(*(args->deltaHost), maxDelta); #ifdef DEBUG_PRINT if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); - - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Expected: "); - for(int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - printf("\n"); - - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Actual: "); - for (int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, + cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); } #endif } - double nranks = args->nProcs*args->nThreads*args->nGpus; - if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + double nranks = args->nProcs * args->nThreads * args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type) * (nranks - 1)) + args->errors[0]++; *delta = 
maxDelta; return testSuccess; } -testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { cudaError_t cudaErr; int remaining = ngpus; - int* done = (int*)malloc(sizeof(int)*ngpus); - memset(done, 0, sizeof(int)*ngpus); + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); while (remaining) { - int idle = 1; - for (int i=0; i= NCCL_VERSION(2,4,0) - if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { - ncclResult_t ncclAsyncErr; - NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); - if (ncclAsyncErr != ncclSuccess) { - // An asynchronous error happened. Stop the operation and destroy - // the communicator - for (int i=0; i= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } #endif - } + } - // We might want to let other threads (including NCCL threads) use the CPU. - if (idle) pthread_yield(); + // We might want to let other threads (including NCCL threads) use the CPU. + if (idle) + pthread_yield(); } free(done); return testSuccess; } -testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter) { +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { size_t count = args->nbytes / wordSize(type); - // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange size_t totalnbytes = max(args->sendBytes, args->expectedBytes); size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; size_t shift = totalnbytes * (iter % steps); @@ -522,57 +599,89 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t NCCLCHECK(ncclGroupStart()); } for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); #ifndef NCCL_MAJOR int cudaDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); CUDACHECK(cudaSetDevice(cudaDev)); #endif - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - char* recvBuff = ((char*)args->recvbuffs[i]) + shift; - char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + char *sendBuff = ((char *)args->sendbuffs[i]) + shift; ncclRedOp_t op; - if(opIndex < ncclNumOps) { + if (opIndex < ncclNumOps) { op = opIndex; } - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) else { union { - int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; - half f16; float f32; double f64; - #if defined(__CUDA_BF16_TYPES_EXIST__) + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) __nv_bfloat16 bf16; - #endif +#endif }; int scalar = preMulScalar(rank); - switch(type) { - case ncclInt8: i8 = int8_t(scalar); break; - case ncclUint8: u8 = uint8_t(scalar); break; - case ncclInt32: i32 = int32_t(scalar); break; - case ncclUint32: u32 = uint32_t(scalar); break; - case ncclInt64: i64 = int32_t(scalar); break; - case ncclUint64: u64 = uint32_t(scalar); break; - case ncclFloat16: f16 = __float2half(float(scalar)); break; - case ncclFloat32: f32 = float(scalar); break; - case ncclFloat64: f64 = double(scalar); break; - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; - #endif + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif } - NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); } - #endif +#endif TESTCHECK(args->collTest->runColl( - (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), - (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - if(opIndex >= ncclNumOps) { - NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank + : sendBuff), + (void *)(in_place ? 
recvBuff + args->recvInplaceOffset * rank + : recvBuff), + count, type, op, root, comm, args->streams[i])); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); } - #endif +#endif } if (args->nGpus > 1) { printf("\nstartColl, args->nGpus > 1 run ncclGroupEnd\n"); @@ -583,18 +692,21 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Complete op before returning TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); } - if (blocking_coll) Barrier(args); + if (blocking_coll) + Barrier(args); return testSuccess; } -testResult_t completeColl(struct threadArgs* args) { - if (blocking_coll) return testSuccess; +testResult_t completeColl(struct threadArgs *args) { + if (blocking_coll) + return testSuccess; TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); return testSuccess; } -testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); if (datacheck) { // Initialize sendbuffs, recvbuffs and expected @@ -606,24 +718,36 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { - if (agg_iters>1) NCCLCHECK(ncclGroupStart()); - for (int aiter = 0; aiter < agg_iters; aiter++) { - TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + if (multi_iters > 1) { + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter)); + } + } else { + if (agg_iters > 1) + NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, + iter * agg_iters + aiter, 0)); + } + if (agg_iters > 1) + NCCLCHECK(ncclGroupEnd()); } - if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } TESTCHECK(completeColl(args)); auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); - deltaSec = deltaSec/(iters*agg_iters); - if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + double deltaSec = + std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec / (iters * agg_iters *multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; Allreduce(args, &deltaSec, average); - double algBw, busBw; - args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, + args->nProcs * args->nThreads * args->nGpus); Barrier(args); @@ -631,8 +755,22 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t static __thread int rep = 0; rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + + //test validation in single itertion, should ideally be included into the multi-iteration run + TESTCHECK(startColl(args, type, op, root, in_place, 0, 0)); + + TESTCHECK(completeColl(args)); - double timeUsec = deltaSec*1.0E6; + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads 
and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec * 1.0E6; char timeStr[100]; if (timeUsec >= 10000.0) { sprintf(timeStr, "%7.0f", timeUsec); @@ -642,9 +780,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t sprintf(timeStr, "%7.2f", timeUsec); } if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); } args->bw[0] += busBw; @@ -652,12 +790,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t return testSuccess; } -void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { - int nranks = args->nProcs*args->nGpus*args->nThreads; - size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { + int nranks = args->nProcs * args->nGpus * args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, + recvInplaceOffset; count = size / wordSize(type); - args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, + &sendInplaceOffset, &recvInplaceOffset, + (size_t)count, (size_t)nranks); args->nbytes = paramCount * wordSize(type); args->sendBytes = sendCount * wordSize(type); @@ -666,237 +807,282 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { args->recvInplaceOffset = recvInplaceOffset * wordSize(type); } -testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { +testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, + const char *typeName, ncclRedOp_t op, const char *opName, + int root) { + // Warm-up for large size + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter)); + } + } + TESTCHECK(completeColl(args)); + + // Warm-up for small size + setupArgs(args->minbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter)); + } + } + TESTCHECK(completeColl(args)); // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - PRINT("\n"); + for (size_t size = args->minbytes; size <= args->maxbytes; + size = ((args->stepfactor > 1) ? 
size * args->stepfactor + : size + args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); } return testSuccess; } -testResult_t threadRunTests(struct threadArgs* args) { +testResult_t threadRunTests(struct threadArgs *args) { // OFTEST_LOG1(TEST, "Enter threadRunTests"); // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus; CUDACHECK(cudaSetDevice(gpuid)); - TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, + test_typenames[nccltype], + (ncclRedOp_t)ncclop, test_opnames[ncclop])); return testSuccess; } -testResult_t threadInit(struct threadArgs* args) { +testResult_t threadInit(struct threadArgs *args) { // OFTEST_LOG1(TEST, "Enter threadInit"); char hostname[1024]; getHostName(hostname, 1024); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs * args->nThreads * args->nGpus; - //set main thread again + // set main thread again is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; NCCLCHECK(ncclGroupStart()); - for (int i=0; inGpus; i++) { - int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; CUDACHECK(cudaSetDevice(gpuid)); // OFTEST_LOG1(TEST, "CommInitRank here"); - NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); TESTCHECK(threadRunTests(args)); - for (int i=0; inGpus; i++) { + for (int i = 0; i < args->nGpus; i++) { NCCLCHECK(ncclCommDestroy(args->comms[i])); } return testSuccess; } -void* threadLauncher(void* thread_) { - struct testThread* thread = (struct testThread*)thread_; +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; thread->ret = thread->func(&thread->args); return NULL; } -testResult_t threadLaunch(struct testThread* thread) { +testResult_t threadLaunch(struct testThread *thread) { pthread_create(&thread->thread, NULL, threadLauncher, thread); return testSuccess; } -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { - CUDACHECK(cudaMalloc(sendbuff, nbytes)); - CUDACHECK(cudaMalloc(recvbuff, nbytes)); - if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); - return testSuccess; +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, 
nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; } testResult_t run(); // Main function -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { // Make sure everyline is flushed so that we see the progress of the test setlinebuf(stdout); - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - ncclGetVersion(&test_ncclVersion); - #else - test_ncclVersion = NCCL_VERSION_CODE; - #endif - //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) - test_opnum = 4; - test_typenum = 9; - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { - test_opnum++; // ncclAvg - #if defined(__CUDA_BF16_TYPES_EXIST__) - test_typenum++; // bfloat16 - #endif - } - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { - test_opnum++; // PreMulSum - } - #endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif // Parse args double parsed; int longindex; static struct option longopts[] = { - {"nthreads", required_argument, 0, 't'}, - {"ngpus", required_argument, 0, 'g'}, - {"minbytes", required_argument, 0, 'b'}, - {"maxbytes", required_argument, 0, 'e'}, - {"stepbytes", required_argument, 0, 'i'}, - {"stepfactor", required_argument, 0, 'f'}, - {"iters", required_argument, 0, 'n'}, - {"agg_iters", required_argument, 0, 'm'}, - {"warmup_iters", required_argument, 0, 'w'}, - {"parallel_init", required_argument, 0, 'p'}, - {"check", required_argument, 0, 'c'}, - {"op", required_argument, 0, 'o'}, - {"datatype", required_argument, 0, 'd'}, - {"root", required_argument, 0, 'r'}, - {"blocking", required_argument, 0, 'z'}, - {"cudagraph", required_argument, 0, 'G'}, - {"average", required_argument, 0, 'a'}, - {"help", no_argument, 0, 'h'}, - {} - }; - - while(1) { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); if (c == -1) break; - 
switch(c) { - case 't': - nThreads = strtol(optarg, NULL, 0); - break; - case 'g': - nGpus = strtol(optarg, NULL, 0); - break; - case 'b': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'minbytes'\n"); - return -1; - } - minBytes = (size_t)parsed; - break; - case 'e': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'maxbytes'\n"); - return -1; - } - maxBytes = (size_t)parsed; - break; - case 'i': - stepBytes = strtol(optarg, NULL, 0); - break; - case 'f': - stepFactor = strtol(optarg, NULL, 0); - break; - case 'n': - iters = (int)strtol(optarg, NULL, 0); - break; - case 'm': + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'M': + multi_iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': #if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) - agg_iters = (int)strtol(optarg, NULL, 0); + agg_iters = (int)strtol(optarg, NULL, 0); #else - fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); #endif - break; - case 'w': - warmup_iters = (int)strtol(optarg, NULL, 0); - break; - case 'c': - datacheck = (int)strtol(optarg, NULL, 0); - break; - case 'p': - parallel_init = (int)strtol(optarg, NULL, 0); - break; - case 'o': - ncclop = ncclstringtoop(optarg); - break; - case 'd': - nccltype = ncclstringtotype(optarg); - break; - case 'r': - ncclroot = strtol(optarg, NULL, 0); - break; - case 'z': - blocking_coll = strtol(optarg, NULL, 0); - break; - case 'G': -#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 - cudaGraphLaunches = strtol(optarg, NULL, 0); + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \ + CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); #else - printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA " + "11.3. 
Ignoring\n"); #endif - break; - case 'a': - average = (int)strtol(optarg, NULL, 0); - break; - case 'h': - default: - if (c != 'h') printf("invalid option '%c'\n", c); - printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" - "[-g,--ngpus ] \n\t" - "[-b,--minbytes ] \n\t" - "[-e,--maxbytes ] \n\t" - "[-i,--stepbytes ] \n\t" - "[-f,--stepfactor ] \n\t" - "[-n,--iters ] \n\t" - "[-m,--agg_iters ] \n\t" - "[-w,--warmup_iters ] \n\t" - "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - "[-o,--op ] \n\t" -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - "[-o,--op ] \n\t" + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-M,--multi_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op ] \n\t" #else - "[-o,--op ] \n\t" + "[-o,--op ] \n\t" #endif - "[-d,--datatype ] \n\t" - "[-r,--root ] \n\t" - "[-z,--blocking <0/1>] \n\t" - "[-G,--cudagraph ] \n\t" - "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" - "[-h,--help]\n", - basename(argv[0])); - return 0; + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; } } if (minBytes > maxBytes) { - fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", - (unsigned long long)minBytes, - (unsigned long long)maxBytes); + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); return -1; } #ifdef MPI_SUPPORT @@ -917,18 +1103,28 @@ testResult_t run() { MPI_Comm_rank(MPI_COMM_WORLD, &proc); uint64_t hostHashs[nProcs]; hostHashs[proc] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); - if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); - if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? 
"factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); PRINT("#\n"); PRINT("# Using devices\n"); @@ -936,23 +1132,26 @@ testResult_t run() { char line[MAX_LINE]; int len = 0; size_t maxMem = ~0; - for (int i=0; i memMaxBytes) { maxBytes = memMaxBytes; - if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + if (proc == 0) + printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + maxBytes); } ncclUniqueId ncclId; @@ -975,45 +1177,73 @@ testResult_t run() { MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif - cudaStream_t streams[nGpus*nThreads]; - void* sendbuffs[nGpus*nThreads]; - void* recvbuffs[nGpus*nThreads]; - void* expected[nGpus*nThreads]; + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads]; + void *recvbuffs[nGpus * nThreads]; + void *expected[nGpus * nThreads]; size_t sendBytes, recvBytes; - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + (size_t)nProcs * nGpus * nThreads); - for (int i=0; i=0; t--) { - threads[t].args.minbytes=minBytes; - threads[t].args.maxbytes=maxBytes; - threads[t].args.stepbytes=stepBytes; - threads[t].args.stepfactor=stepFactor; + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; threads[t].args.localRank = localRank; - threads[t].args.nProcs=nProcs; - threads[t].args.proc=proc; - threads[t].args.nThreads=nThreads; - threads[t].args.thread=t; - threads[t].args.nGpus=nGpus; - threads[t].args.sendbuffs = sendbuffs+t*nGpus; - threads[t].args.recvbuffs = recvbuffs+t*nGpus; - threads[t].args.expected = expected+t*nGpus; + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + threads[t].args.sendbuffs = sendbuffs + t * nGpus; + threads[t].args.recvbuffs = recvbuffs + t * nGpus; + threads[t].args.expected = expected + t * nGpus; threads[t].args.ncclId = ncclId; - threads[t].args.comms=comms+t*nGpus; - threads[t].args.streams=streams+t*nGpus; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } - threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; threads[t].args.barrier_idx = 0; - threads[t].args.reduce = (volatile double*)reduce; - threads[t].args.sync = (volatile int*)sync; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; threads[t].args.sync_idx = 0; - threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); - threads[t].args.errors=errors+t; - threads[t].args.bw=bw+t; - threads[t].args.bw_count=bw_count+t; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; 
threads[t].args.reportErrors = 1; threads[t].func = parallel_init ? threadInit : threadRunTests; if (t) - TESTCHECK(threadLaunch(threads+t)); + TESTCHECK(threadLaunch(threads + t)); else TESTCHECK(threads[t].func(&threads[t].args)); } // Wait for other threads and accumulate stats and errors - for (int t=nThreads-1; t>=0; t--) { - if (t) pthread_join(threads[t].thread, NULL); + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); TESTCHECK(threads[t].ret); if (t) { errors[0] += errors[t]; @@ -1082,25 +1317,31 @@ testResult_t run() { #endif if (!parallel_init) { - for(int i=0; icollTest->initData(args, type, op, root, rep, in_place)); -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // Begin cuda graph capture for data check -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); -// } -// } -// #endif - //test validation in single itertion, should ideally be included into the multi-iteration run - // TESTCHECK(startColl(args, type, op, root, in_place, 0)); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // End cuda graph capture -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); -// } -// // Instantiate cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); -// } -// // Launch cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); -// } -// } -// #endif - - // TESTCHECK(completeColl(args)); + TESTCHECK(startColl(args, type, op, root, in_place, 0)); -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// //destroy cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); -// CUDACHECK(cudaGraphDestroy(graphs[i])); -// } -// } -// #endif + TESTCHECK(completeColl(args)); TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); From 76b1cd7ed62694b63a879637f81f2d20d8f42ce6 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 19 Jul 2022 18:16:08 +0000 Subject: [PATCH 013/109] remove log --- src_simple/common_simple.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index bb64ebc..706db22 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -600,7 +600,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, } for (int i = 0; i < args->nGpus; i++) { ncclComm_t comm = args->comms[miter * nGpus + i]; - OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); #ifndef NCCL_MAJOR int cudaDev; NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); From a466914a2f9e034a3ff25e33ace984117d0feb4a Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 28 Jul 2022 16:50:23 +0000 Subject: [PATCH 014/109] use prepare and done in nccl-tests --- src_simple/common_simple.cu | 108 ++++++++++++++++++++++++++++++++- src_simple/common_simple.h | 9 +-- src_simple/ofccl_all_reduce.cu | 16 +++-- 3 files changed, 122 insertions(+), 11 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 706db22..e34c2b1 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -584,6 +584,94 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, return 
testSuccess; } +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + testResult_t startColl(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { size_t count = args->nbytes / wordSize(type); @@ -595,7 +683,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, size_t shift = totalnbytes * (iter % steps); if (args->nGpus > 1) { - printf("\nstartColl, args->nGpus > 1 run ncclGroupStart\n"); + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); NCCLCHECK(ncclGroupStart()); } for (int i = 0; i < args->nGpus; i++) { @@ -684,7 +772,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, #endif } if (args->nGpus > 1) { - printf("\nstartColl, args->nGpus > 1 run ncclGroupEnd\n"); + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); NCCLCHECK(ncclGroupEnd()); } @@ -809,7 +897,21 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, const char *typeName, ncclRedOp_t op, const char *opName, - int root) { + int root, bool is_ofccl) { + if (is_ofccl) { + // prepare for all size. op, type traversed in the caller. + for (size_t size = args->minbytes; size <= args->maxbytes; + size = ((args->stepfactor > 1) ? 
size * args->stepfactor + : size + args->stepbytes)) { + setupArgs(size, type, args); + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter)); + } + } + + ofcclPrepareDone(); + } + // Warm-up for large size setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index caaafef..b5c85a1 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -16,9 +16,9 @@ #include #include "nccl1_compat.h" -#define OFTEST_LOG(PRE, FMT, args...) printf("\nTEST [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__, args) -#define OFTEST_LOG1(PRE, FMT) printf("\nTEST [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__) -#define OFTEST_LOG0(PRE) printf("\nTEST [%s:%d] <%s> " #PRE, __FILE__, __LINE__, __func__) +#define OFTEST_LOG(PRE, FMT, args...) printf("\n(testlog) [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("\n(testlog) [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("\n(testlog) [%s:%d] <%s> " #PRE, __FILE__, __LINE__, __func__) #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ @@ -75,6 +75,7 @@ struct testColl { void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; @@ -144,7 +145,7 @@ struct testThread { // Provided by common.cu extern void Barrier(struct threadArgs* args); -extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 62f8b69..cda3f34 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -57,8 +57,15 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ofcclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> ofccl_nccl_test invoke ofcclAllReduce\n", getpid()); + // NCCLCHECK(ofcclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + // OFTEST_LOG1(TEST, "UNIMPLEMENTED ofcclAllReduce"); + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, 
op, comm, collId)); + // OFTEST_LOG(TEST, "invoke ofcclPrepareAllReduce with count=%lu, collId=%d", count, collId); return testSuccess; } @@ -67,7 +74,8 @@ struct testColl allReduceTest = { AllReduceGetCollByteCount, AllReduceInitData, AllReduceGetBw, - AllReduceRunColl + AllReduceRunColl, + AllReducePrepare }; void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { @@ -104,7 +112,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t for (int i=0; i Date: Tue, 9 Aug 2022 07:00:56 +0000 Subject: [PATCH 015/109] check no reused ncclComm in ofcclCommList --- src_simple/common_simple.cu | 1 + src_simple/ofccl_all_reduce.cu | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index e34c2b1..44ffe21 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -1297,6 +1297,7 @@ testResult_t run() { } // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when use multi size. ncclComm_t *comms = (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); ncclComm_t *adjusted_comms = diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index cda3f34..4d9af93 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -8,6 +8,7 @@ #include "common_simple.h" #include #include +#include void print_header() { PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); @@ -65,7 +66,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, nccl testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId) { NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId)); - // OFTEST_LOG(TEST, "invoke ofcclPrepareAllReduce with count=%lu, collId=%d", count, collId); + OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } From 818c8e3fa0651757f490668f64f63056761cd54b Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 9 Aug 2022 08:17:19 +0000 Subject: [PATCH 016/109] invoke ofcclDestroy --- src_simple/common_simple.cu | 10 +++++++++- src_simple/ofccl_all_reduce.cu | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 44ffe21..2320dc9 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -900,6 +900,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, int root, bool is_ofccl) { if (is_ofccl) { // prepare for all size. op, type traversed in the caller. + // TODO: if we support multi size, each size should use a separate ncclComm for (size_t size = args->minbytes; size <= args->maxbytes; size = ((args->stepfactor > 1) ? 
size * args->stepfactor : size + args->stepbytes)) { @@ -912,6 +913,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, ofcclPrepareDone(); } + // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 // Warm-up for large size setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { @@ -943,6 +945,12 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } + + if (is_ofccl) { + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); + ofcclDestroy(); + } + return testSuccess; } @@ -1307,7 +1315,7 @@ testResult_t run() { int gpuArray[nGpus * nThreads]; for (int i = 0; i < nGpus * nThreads; i++) gpuArray[i] = i; - OFTEST_LOG1(TEST, "CommInitAll here"); + // OFTEST_LOG1(TEST, "CommInitAll here"); // use seprate comm // TODO: we do not support MPI now. for (int miter = 0; miter < multi_iters; miter++) { diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 4d9af93..714365d 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -66,7 +66,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, nccl testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId) { NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId)); - OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } From 67e70b93f9cf73ec227f8188a42b1f0060970ecc Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 23 Aug 2022 03:15:33 +0000 Subject: [PATCH 017/109] use ofcclRunAllReduce --- .gitignore | 4 +++- src_simple/Makefile | 3 ++- src_simple/common_simple.cu | 3 +-- src_simple/common_simple.h | 3 +-- src_simple/ofccl_all_reduce.cu | 4 ++-- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index c908b05..b0853be 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,6 @@ # See LICENCE.txt for license information /build -.clangd \ No newline at end of file +.clangd + +.vscode \ No newline at end of file diff --git a/src_simple/Makefile b/src_simple/Makefile index 3247401..de282de 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -75,7 +75,8 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce_group all_reduce_simple ofccl_all_reduce +# BIN_FILES_LIST := all_reduce_group all_reduce_simple ofccl_all_reduce +BIN_FILES_LIST := ofccl_all_reduce BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 2320dc9..b03fbd9 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -762,8 +762,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff), (void *)(in_place ? 
recvBuff + args->recvInplaceOffset * rank - : recvBuff), - count, type, op, root, comm, args->streams[i])); + : recvBuff), miter)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index b5c85a1..1fb299d 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -73,8 +73,7 @@ struct testColl { testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); - testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId); testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId); }; extern struct testColl allReduceTest; diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 714365d..63744ae 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -56,9 +56,9 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId) { - // NCCLCHECK(ofcclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId)); // OFTEST_LOG1(TEST, "UNIMPLEMENTED ofcclAllReduce"); return testSuccess; } From 97f58bcbb70611c3b8caba914b3c5b295087802c Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 26 Aug 2022 06:56:24 +0000 Subject: [PATCH 018/109] use callback --- src_simple/common_simple.h | 6 +++--- src_simple/ofccl_all_reduce.cu | 23 +++++++++++++++++++++-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 1fb299d..9b82e5a 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -16,9 +16,9 @@ #include #include "nccl1_compat.h" -#define OFTEST_LOG(PRE, FMT, args...) printf("\n(testlog) [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__, args) -#define OFTEST_LOG1(PRE, FMT) printf("\n(testlog) [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__) -#define OFTEST_LOG0(PRE) printf("\n(testlog) [%s:%d] <%s> " #PRE, __FILE__, __LINE__, __func__) +#define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 63744ae..a764338 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -9,6 +9,7 @@ #include #include #include +#include void print_header() { PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); @@ -57,9 +58,27 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl } testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId) { + int gotCqe = 0; + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + auto callback = [&](int collIdFromCqe){ + if (collId != collIdFromCqe) { + // TODO: more robust error handle. + OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + return -1; + } + gotCqe = 1; + OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + return 0; + }; + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, callback)); - NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId)); - // OFTEST_LOG1(TEST, "UNIMPLEMENTED ofcclAllReduce"); + // TODO: 这会损害带宽测量的结果,之后在common_simple.cu里搞个数组,统一等待。 + while(gotCqe == 0) { + sched_yield(); + } + return testSuccess; } From 8a3d5f88511063db09d10c3d48baf229ceb75c5c Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 26 Aug 2022 09:47:13 +0000 Subject: [PATCH 019/109] use func-ptr for callback, instead of std::function and lambda --- src_simple/common_simple.h | 5 +++++ src_simple/ofccl_all_reduce.cu | 34 +++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 9b82e5a..82a581c 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -64,6 +64,11 @@ typedef enum { } \ } while(0) +typedef struct { + int collId; + int gotCqe; +} CallBackArgs; + struct testColl { const char name[20]; void (*getCollByteCount)( diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index a764338..ed80fa3 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -57,25 +57,33 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } +int myCallback(int collIdFromCqe, void *args) { + // TODO: 不打log把这里删了 + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + int collId = ((CallBackArgs *)args)->collId; + if (collId != collIdFromCqe) { + // TODO: more robust error handle. + OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + return -1; + } + ((CallBackArgs *)args)->gotCqe = 1; + OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + return 0; +} + testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId) { - int gotCqe = 0; int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - auto callback = [&](int collIdFromCqe){ - if (collId != collIdFromCqe) { - // TODO: more robust error handle. 
- OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); - return -1; - } - gotCqe = 1; - OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); - return 0; - }; - NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, callback)); + CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args)); // TODO: 这会损害带宽测量的结果,之后在common_simple.cu里搞个数组,统一等待。 - while(gotCqe == 0) { + while(args->gotCqe == 0) { sched_yield(); } From a3a1aea2cd278e80784470b61aa8a679c7bf537e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 28 Aug 2022 10:22:01 +0000 Subject: [PATCH 020/109] stuck --- src_simple/common_simple.cu | 21 ++++++++++++++++++--- src_simple/common_simple.h | 4 +++- src_simple/ofccl_all_reduce.cu | 11 +++-------- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index b03fbd9..d123f54 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -100,6 +100,9 @@ static int average = 1; #define NUM_BLOCKS 32 +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + static double parsesize(const char *value) { long long int units; double size; @@ -757,12 +760,12 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, &op, &u64, type, ncclScalarHostImmediate, comm)); } #endif - + // miter就是collId。 TESTCHECK(args->collTest->runColl( (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff), (void *)(in_place ? recvBuff + args->recvInplaceOffset * rank - : recvBuff), miter)); + : recvBuff), miter, cbArgList + miter)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { @@ -787,8 +790,20 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, testResult_t completeColl(struct threadArgs *args) { if (blocking_coll) return testSuccess; + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + } + } + } + } - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + // TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); return testSuccess; } diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 82a581c..c8e94e6 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -69,6 +69,8 @@ typedef struct { int gotCqe; } CallBackArgs; +#define MAX_COLL_NUM 10000 + struct testColl { const char name[20]; void (*getCollByteCount)( @@ -78,7 +80,7 @@ struct testColl { testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); - testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args); testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId); }; extern struct testColl allReduceTest; diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index ed80fa3..0f9fef2 100644 --- a/src_simple/ofccl_all_reduce.cu +++ 
From a3a1aea2cd278e80784470b61aa8a679c7bf537e Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Sun, 28 Aug 2022 10:22:01 +0000
Subject: [PATCH 020/109] stuck

---
 src_simple/common_simple.cu    | 21 ++++++++++++++++++---
 src_simple/common_simple.h     |  4 +++-
 src_simple/ofccl_all_reduce.cu | 11 +++--------
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index b03fbd9..d123f54 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -100,6 +100,9 @@ static int average = 1;
 
 #define NUM_BLOCKS 32
 
+static thread_local CallBackArgs cbArgList[MAX_COLL_NUM];
+static thread_local int seenCqe[MAX_COLL_NUM];
+
 static double parsesize(const char *value) {
   long long int units;
   double size;
@@ -757,12 +760,12 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type,
           &op, &u64, type, ncclScalarHostImmediate, comm));
     }
 #endif
-
+  // miter is the collId.
   TESTCHECK(args->collTest->runColl(
       (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank
                         : sendBuff),
       (void *)(in_place ? recvBuff + args->recvInplaceOffset * rank
-                        : recvBuff), miter));
+                        : recvBuff), miter, cbArgList + miter));
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
     if (opIndex >= ncclNumOps) {
@@ -787,8 +790,20 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type,
 testResult_t completeColl(struct threadArgs *args) {
   if (blocking_coll)
     return testSuccess;
+
+  int gotCqeCnt = 0;
+  while (gotCqeCnt < multi_iters) {
+    for (int i = 0; i < multi_iters; i++) {
+      if (cbArgList[i].gotCqe == 1) {
+        if (seenCqe[i] == 0) {
+          gotCqeCnt++;
+          seenCqe[i] = 1;
+        }
+      }
+    }
+  }
 
-  TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
+  // TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
   return testSuccess;
 }
 
diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h
index 82a581c..c8e94e6 100644
--- a/src_simple/common_simple.h
+++ b/src_simple/common_simple.h
@@ -69,6 +69,8 @@ typedef struct {
   int gotCqe;
 } CallBackArgs;
 
+#define MAX_COLL_NUM 10000
+
 struct testColl {
   const char name[20];
   void (*getCollByteCount)(
@@ -78,7 +80,7 @@ struct testColl {
   testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
       ncclRedOp_t op, int root, int rep, int in_place);
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
-  testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId);
+  testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args);
   testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId);
 };
 extern struct testColl allReduceTest;
diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu
index ed80fa3..0f9fef2 100644
--- a/src_simple/ofccl_all_reduce.cu
+++ b/src_simple/ofccl_all_reduce.cu
@@ -58,7 +58,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
 }
 
 int myCallback(int collIdFromCqe, void *args) {
-  // TODO: remove this once we stop logging
+  // TODO: remove this once we stop logging, otherwise it hurts performance.
   int cudaDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
   int collId = ((CallBackArgs *)args)->collId;
@@ -72,20 +72,15 @@ int myCallback(int collIdFromCqe, void *args) {
   return 0;
 }
 
-testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId) {
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args) {
   int cudaDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
 
-  CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs));
+  // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs));
   args->collId = collId;
   args->gotCqe = 0;
 
   NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args));
-
-  // TODO: this hurts the bandwidth measurement; later, keep an array in common_simple.cu and wait on all of them in one place.
-  while(args->gotCqe == 0) {
-    sched_yield();
-  }
 
   return testSuccess;
 }
 
From 7ff3ea5deeb045105c226767557385b27c0812e8 Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Sun, 28 Aug 2022 10:45:21 +0000
Subject: [PATCH 021/109] completeColl in warmup result in stuck

---
 src_simple/common_simple.cu    | 37 +++++++++++++++++-----------------
 src_simple/ofccl_all_reduce.cu |  1 +
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index d123f54..931fd6e 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -928,25 +928,26 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type,
   }
 
   // TODO: if we support multi size, we could warm up every size, or keep the current way but make sure the right comm is picked.
+  // TODO: also, if we keep the warmup we have to prepare matching callbackArgs, which is cumbersome; for the comparison experiments, consider running both nccl and ofccl without warmup.
   // Warm-up for large size
-  setupArgs(args->maxbytes, type, args);
-  for (int iter = 0; iter < warmup_iters; iter++) {
-    for (int miter = 0; miter < multi_iters; miter++) {
-      TESTCHECK(startColl(args, type, op, root, 0,
-                          iter * multi_iters + miter, miter));
-    }
-  }
-  TESTCHECK(completeColl(args));
-
-  // Warm-up for small size
-  setupArgs(args->minbytes, type, args);
-  for (int iter = 0; iter < warmup_iters; iter++) {
-    for (int miter = 0; miter < multi_iters; miter++) {
-      TESTCHECK(startColl(args, type, op, root, 0,
-                          iter * multi_iters + miter, miter));
-    }
-  }
-  TESTCHECK(completeColl(args));
+  // setupArgs(args->maxbytes, type, args);
+  // for (int iter = 0; iter < warmup_iters; iter++) {
+  //   for (int miter = 0; miter < multi_iters; miter++) {
+  //     TESTCHECK(startColl(args, type, op, root, 0,
+  //                         iter * multi_iters + miter, miter));
+  //   }
+  // }
+  // TESTCHECK(completeColl(args));
+
+  // // Warm-up for small size
+  // setupArgs(args->minbytes, type, args);
+  // for (int iter = 0; iter < warmup_iters; iter++) {
+  //   for (int miter = 0; miter < multi_iters; miter++) {
+  //     TESTCHECK(startColl(args, type, op, root, 0,
+  //                         iter * multi_iters + miter, miter));
+  //   }
+  // }
+  // TESTCHECK(completeColl(args));
 
   // Benchmark
   for (size_t size = args->minbytes; size <= args->maxbytes;
diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu
index 0f9fef2..0b6aacf 100644
--- a/src_simple/ofccl_all_reduce.cu
+++ b/src_simple/ofccl_all_reduce.cu
@@ -81,6 +81,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa
   args->gotCqe = 0;
 
   NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args));
+  OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args);
 
   return testSuccess;
 }
 
From ee76beb6f3b204b97bf4e5d2b82530ef449bf111 Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Sun, 28 Aug 2022 12:38:09 +0000
Subject: [PATCH 022/109] +lock

---
 src_simple/common_simple.cu    | 14 ++++++++++++--
 src_simple/common_simple.h     |  1 +
 src_simple/ofccl_all_reduce.cu |  3 +++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index 931fd6e..5f3aadc 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -790,25 +790,35 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type,
 testResult_t completeColl(struct threadArgs *args) {
   if (blocking_coll)
     return testSuccess;
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
 
   int gotCqeCnt = 0;
   while (gotCqeCnt < multi_iters) {
     for (int i = 0; i < multi_iters; i++) {
+      pthread_mutex_lock(&cbArgList[i].mutex);
       if (cbArgList[i].gotCqe == 1) {
         if (seenCqe[i] == 0) {
           gotCqeCnt++;
           seenCqe[i] = 1;
         }
       }
+      pthread_mutex_unlock(&cbArgList[i].mutex);
     }
+    // OFTEST_LOG(TEST, "<%lu> rank=%d, completeColl gotCqeCnt = %d", pthread_self(), cudaDev, gotCqeCnt);
   }
 
   // TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
   return testSuccess;
 }
 
-testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type,
-                       ncclRedOp_t op, int root, int in_place) {
+testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, multi_iters = %d", pthread_self(), cudaDev, multi_iters);
+
   size_t count = args->nbytes / wordSize(type);
   if (datacheck) {
     // Initialize sendbuffs, recvbuffs and expected
diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h
index c8e94e6..bf2d0fd 100644
--- a/src_simple/common_simple.h
+++ b/src_simple/common_simple.h
@@ -67,6 +67,7 @@ typedef enum {
 typedef struct {
   int collId;
   int gotCqe;
+  pthread_mutex_t mutex;
 } CallBackArgs;
 
 #define MAX_COLL_NUM 10000
diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu
index 0b6aacf..b022b98 100644
--- a/src_simple/ofccl_all_reduce.cu
+++ b/src_simple/ofccl_all_reduce.cu
@@ -67,7 +67,9 @@ int myCallback(int collIdFromCqe, void *args) {
     OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId);
     return -1;
   }
+  pthread_mutex_lock(&(((CallBackArgs *)args)->mutex));
   ((CallBackArgs *)args)->gotCqe = 1;
+  pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex));
   OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId);
   return 0;
 }
@@ -79,6 +81,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa
   // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs));
   args->collId = collId;
   args->gotCqe = 0;
+  pthread_mutex_init(&args->mutex, NULL);
 
   NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args));
   OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args);
From 3f0a8fea86fd275a0e1748b71207848ede52d95b Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Mon, 29 Aug 2022 09:07:35 +0000
Subject: [PATCH 023/109] tidy log

---
 src_simple/common_simple.cu    | 8 ++++----
 src_simple/ofccl_all_reduce.cu | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index 5f3aadc..137288c 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -814,10 +814,6 @@ testResult_t completeColl(struct threadArgs *args) {
 }
 
 testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
-
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, multi_iters = %d", pthread_self(), cudaDev, multi_iters);
 
   size_t count = args->nbytes / wordSize(type);
   if (datacheck) {
@@ -1263,6 +1259,10 @@ testResult_t run() {
   PRINT("#\n");
 
   PRINT("# Using devices\n");
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, multi_iters = %d", pthread_self(), cudaDev, multi_iters);
 #define MAX_LINE 2048
   char line[MAX_LINE];
   int len = 0;
diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu
index b022b98..2b336d4 100644
--- a/src_simple/ofccl_all_reduce.cu
+++ b/src_simple/ofccl_all_reduce.cu
@@ -70,7 +70,7 @@ int myCallback(int collIdFromCqe, void *args) {
   pthread_mutex_lock(&(((CallBackArgs *)args)->mutex));
   ((CallBackArgs *)args)->gotCqe = 1;
   pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex));
-  OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId);
+  // OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId);
   return 0;
 }
 
@@ -84,7 +84,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa
   pthread_mutex_init(&args->mutex, NULL);
 
   NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args));
-  OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args);
+  // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args);
 
   return testSuccess;
 }
 
From 0bd6d6aa75e4e963085265f80bf6f3843d2ae90a Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Mon, 5 Sep 2022 01:55:48 +0000
Subject: [PATCH 024/109] nccl-tests run exactly once

---
 src/common.cu | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 05f814d..72857cd 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -596,8 +596,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   }
 
   // Sync
-  TESTCHECK(startColl(args, type, op, root, in_place, 0));
-  TESTCHECK(completeColl(args));
+  // TODO: restore this later?
+  // TESTCHECK(startColl(args, type, op, root, in_place, 0));
+  // TESTCHECK(completeColl(args));
 
   Barrier(args);
 
@@ -777,7 +778,8 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
     setupArgs(size, type, args);
     print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
     TESTCHECK(BenchTime(args, type, op, root, 0));
-    TESTCHECK(BenchTime(args, type, op, root, 1));
+    // TODO: measure whether this should be restored?
+    // TESTCHECK(BenchTime(args, type, op, root, 1));
     PRINT("\n");
   }
   return testSuccess;
 
From 9a35e7f214c5eb4e3ed89317a36c292ee0c0980c Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Thu, 8 Sep 2022 11:47:15 +0000
Subject: [PATCH 025/109] ad-hoc check

---
 src_simple/common_simple.cu        |   32 +-
 src_simple/common_simple.cu.pure   | 1216 ----------------------------
 src_simple/common_simple.cu.simple | 1186 ---------------------------
 3 files changed, 17 insertions(+), 2417 deletions(-)
 delete mode 100644 src_simple/common_simple.cu.pure
 delete mode 100644 src_simple/common_simple.cu.simple

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index 137288c..063664d 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -279,7 +279,9 @@ __device__ double testValue(const size_t offset, const int rep,
 template <>
 __device__ float testValue(const size_t offset, const int rep,
                            const int rank) {
-  return 1.0 / (1.0 + (float)testValue(offset, rep, rank));
+  // IF_CHECK: to verify correctness, comment out the first return and expose the second one.
+  // return 1.0 / (1.0 + (float)testValue(offset, rep, rank));
+  return 1.0;
 }
 template <>
 __device__ half testValue(const size_t offset, const int rep,
@@ -826,20 +828,9 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
   // Performance Benchmark
   auto start = std::chrono::high_resolution_clock::now();
   for (int iter = 0; iter < iters; iter++) {
-    if (multi_iters > 1) {
-      for (int miter = 0; miter < multi_iters; miter++) {
-        TESTCHECK(startColl(args, type, op, root, in_place,
-                            iter * multi_iters + miter, miter));
-      }
-    } else {
-      if (agg_iters > 1)
-        NCCLCHECK(ncclGroupStart());
-      for (int aiter = 0; aiter < agg_iters; aiter++) {
-        TESTCHECK(startColl(args, type, op, root, in_place,
-                            iter * agg_iters + aiter, 0));
-      }
-      if (agg_iters > 1)
-        NCCLCHECK(ncclGroupEnd());
+    for (int miter = 0; miter < multi_iters; miter++) {
+      TESTCHECK(startColl(args, type, op, root, in_place,
+                          iter * multi_iters + miter, miter));
     }
   }
 
@@ -863,6 +854,17 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
   static __thread int rep = 0;
   rep++;
 
+  // IF_CHECK: to verify correctness, expose the block below.
+  int printNum = 10;
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  float *ptr = (float *)malloc(printNum * sizeof(float));
+  cudaMemcpy(ptr, args->recvbuffs[0], printNum * sizeof(float), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < printNum; i++) {
+    OFTEST_LOG(TEST, "<%lu> rank=%d, recvbuff[%d]=%f", pthread_self(), cudaDev, i, ptr[i]);
+  }
+  free(ptr);
+
   if (datacheck) {
     // Initialize sendbuffs, recvbuffs and expected
     TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
diff --git a/src_simple/common_simple.cu.pure b/src_simple/common_simple.cu.pure
deleted file mode 100644
index c25c0e3..0000000
--- a/src_simple/common_simple.cu.pure
+++ /dev/null
@@ -1,1216 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
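The IF_CHECK switch in PATCH 025 relies on a simple invariant: once testValue<float> always returns 1.0, a sum AllReduce over nranks ranks must leave exactly (float)nranks in every element of recvbuff, which is what the printNum loop added to BenchTime lets you eyeball in the logs. A hypothetical helper that automates the same spot check is sketched below; checkAllOnesAllReduce is not part of the harness, and it assumes the default ncclSum op.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Counts mismatches among the first n floats of a device buffer produced by a
// sum AllReduce whose inputs were all 1.0f; every element should equal nranks.
static int checkAllOnesAllReduce(const float *d_recvbuff, int n, int nranks) {
  float *h = (float *)malloc(n * sizeof(float));
  cudaMemcpy(h, d_recvbuff, n * sizeof(float), cudaMemcpyDeviceToHost);
  int bad = 0;
  for (int i = 0; i < n; i++) {
    if (h[i] != (float)nranks) {  // small integers are exact in fp32, so == is safe here
      printf("recvbuff[%d]=%f, expected %d\n", i, h[i], nranks);
      bad++;
    }
  }
  free(h);
  return bad;
}

Called right after completeColl() with args->recvbuffs[0], it yields a pass/fail count instead of ten log lines to read.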
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "common_simple.h" -#include -#include -#include -#include -#include "cuda.h" - -int test_ncclVersion = 0; // init'd with ncclGetVersion() - -#if NCCL_MAJOR >= 2 - ncclDataType_t test_types[ncclNumTypes] = { - ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclBfloat16 - #endif - }; - const char *test_typenames[ncclNumTypes] = { - "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , "bfloat16" - #endif - }; - int test_typenum = -1; - - const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclAvg - #endif - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand - #endif - }; - int test_opnum = -1; -#else - ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; - const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; - int test_typenum = 7; - const char *test_opnames[] = {"sum", "prod", "max", "min"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; - int test_opnum = 4; -#endif - -thread_local int is_main_thread = 0; - -// Command line parameter defaults -static int nThreads = 1; -static int nGpus = 1; -static size_t minBytes = 32*1024*1024; -static size_t maxBytes = 32*1024*1024; -static size_t stepBytes = 1*1024*1024; -static size_t stepFactor = 1; -static int datacheck = 1; -static int warmup_iters = 5; -static int iters = 20; -static int agg_iters = 1; -static int ncclop = ncclSum; -static int nccltype = ncclFloat; -static int ncclroot = 0; -static int parallel_init = 0; -static int blocking_coll = 0; -static int cudaGraphLaunches = 0; -// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) -static int average = 1; - -#define NUM_BLOCKS 32 - -static double parsesize(const char *value) { - long long int units; - double size; - char size_lit; - - int count = sscanf(value, "%lf %1s", &size, &size_lit); - - switch (count) { - case 2: - switch (size_lit) { - case 'G': - case 'g': - units = 1024*1024*1024; - break; - case 'M': - case 'm': - units = 1024*1024; - break; - case 'K': - case 'k': - units = 1024; - break; - default: - return -1.0; - }; - break; - case 1: - units = 1; - break; - default: - return -1.0; - } - - return size * units; -} - -double DeltaMaxValue(ncclDataType_t type) { - switch(type) { - case ncclHalf: return 1e-2; -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: return 1e-2; -#endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint8: - //case ncclInt32: - case ncclUint32: -#endif - case ncclInt64: - case ncclUint64: return 1e-200; - } - return 1e-200; -} - -template __device__ -double absDiff(T a, T b) { - return fabs((double)(b - a)); -} - -template<> __device__ -double absDiff(half a, half b) { - float x = __half2float(a); - float y = __half2float(b); - return fabs((double)(y-x)); -} - -template __device__ -float toFloat(T a) { - return (float)a; -} -template<> 
__device__ -float toFloat(half a) { - return __half2float(a); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> __device__ -float toFloat(__nv_bfloat16 a) { - return __bfloat162float(a); -} -#endif - -template __global__ -void deltaKern(void* A_, void* B_, size_t count, double* max) { - const T* A = (const T*)A_; - const T* B = (const T*)B_; - __shared__ double temp[BSIZE]; - int tid = blockIdx.x*blockDim.x + threadIdx.x; - double locmax = 0.0; - for(size_t i=tid; i locmax ) { - locmax = delta; -#ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); -#endif - } - } - - tid = threadIdx.x; - temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { - __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; - } - __syncthreads(); - if( threadIdx.x == 0) - max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; -} - -testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { - switch (type) { -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; -#endif - case ncclHalf: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclFloat: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclDouble: - deltaKern<<>>(results, expected, count, devmax); break; - - case ncclChar: -#if NCCL_MAJOR >= 2 - case ncclUint8: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint32: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt64: - case ncclUint64: - deltaKern<<>>(results, expected, count, devmax); break; - } - CUDACHECK(cudaDeviceSynchronize()); - for (int i=1; i -__device__ T testValue(const size_t offset, const int rep, const int rank) { - uint8_t v = (rep+rank+offset) % 256; - return (T)v; -} - -// For floating point datatype, we use values between 0 and 1 otherwise the -// Product operation will produce NaNs. -template<> -__device__ double testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(double)testValue(offset, rep, rank)); -} -template<> -__device__ float testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(float)testValue(offset, rep, rank)); -} -template<> -__device__ half testValue(const size_t offset, const int rep, const int rank) { - return __float2half(testValue(offset, rep, rank)); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { - return __float2bfloat16(testValue(offset, rep, rank)); -} -#endif - -// Operations -template -__device__ T ncclOpSum(T a, T b) { return a+b; } -template -__device__ T ncclOpProd(T a, T b) { return a*b; } -template -__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } -template -__device__ T ncclOpMin(T a, T b) { return a -__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } -template<> -__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } -template<> -__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } -template<> -__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } - -template -__device__ T ncclPPOpIdent(T x, int arg) { return x; } -template -__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } -template -__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } -template<> -__device__ half ncclPPOpMul(half x, int arg) { - return __float2half(__half2float(x)*float(arg)); -} -template<> -__device__ half ncclPPOpDiv(half x, int n) { - return __float2half(__half2float(x)/n); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { - return __float2bfloat16(__bfloat162float(x)*float(arg)); -} -template<> -__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { - return __float2bfloat16(__bfloat162float(x)/n); -} -#endif - -__host__ __device__ int preMulScalar(int rank) { - return 1 + rank%2; -} - -template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); - val = PreOp(val, preMulScalar(0)); - for (int i=1; i(o+offset, rep, i); - val1 = PreOp(val1, preMulScalar(i)); - val = Op(val, val1); - } - data[o] = PostOp(val, nranks); - } -} - -#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ - KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) -#else - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) -#endif - -static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - OPS(__nv_bfloat16) -#endif -}; - -testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); - return testSuccess; -} - -template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); -} - -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - 
(void*)InitDataKernel< float>, - (void*)InitDataKernel< double>, -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - (void*)InitDataKernel<__nv_bfloat16> -#endif -}; - -template -testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { - T* ptr = (T*)dest; - InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); - return testSuccess; -} - -testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); - return testSuccess; -} - -void Barrier(struct threadArgs* args) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - MPI_Barrier(MPI_COMM_WORLD); -#endif - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); - } - args->barrier_idx=!args->barrier_idx; -} - -// Inter-thread/process barrier+allreduce -void Allreduce(struct threadArgs* args, double* value, int average) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - double val = *value; - if (args->thread > 0) { - double val2 = args->reduce[args->barrier_idx]; - if (average == 1) val += val2; - if (average == 2) val = std::min(val, val2); - if (average == 3) val = std::max(val, val2); - } - if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - if (average != 0) { - MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; - MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); - } -#endif - if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; - args->reduce[1-args->barrier_idx] = 0; - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); - } - *value = args->reduce[args->barrier_idx]; - args->barrier_idx=!args->barrier_idx; -} - -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { - size_t count = args->expectedBytes/wordSize(type); - double maxDelta = 0.0; - for (int i=0; inGpus; i++) { - int device; - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); - void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); - maxDelta = std::max(*(args->deltaHost), maxDelta); - -#ifdef DEBUG_PRINT - if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); - - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Expected: "); - for(int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - printf("\n"); - - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Actual: "); - for (int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); - } -#endif - } - double nranks = args->nProcs*args->nThreads*args->nGpus; - if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; - *delta = maxDelta; - return testSuccess; -} - -testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { - cudaError_t cudaErr; - int remaining = ngpus; - int* done = (int*)malloc(sizeof(int)*ngpus); - memset(done, 0, sizeof(int)*ngpus); - while (remaining) { - int idle = 1; - for (int i=0; i= NCCL_VERSION(2,4,0) - if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { - ncclResult_t ncclAsyncErr; - NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); - if (ncclAsyncErr != ncclSuccess) { - // An asynchronous error happened. Stop the operation and destroy - // the communicator - for (int i=0; inbytes / wordSize(type); - - // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange - size_t totalnbytes = max(args->sendBytes, args->expectedBytes); - size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; - size_t shift = totalnbytes * (iter % steps); - - if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); - for (int i = 0; i < args->nGpus; i++) { -#ifndef NCCL_MAJOR - int cudaDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); - CUDACHECK(cudaSetDevice(cudaDev)); -#endif - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - char* recvBuff = ((char*)args->recvbuffs[i]) + shift; - char* sendBuff = ((char*)args->sendbuffs[i]) + shift; - ncclRedOp_t op; - - if(opIndex < ncclNumOps) { - op = opIndex; - } - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - else { - union { - int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; - half f16; float f32; double f64; - #if defined(__CUDA_BF16_TYPES_EXIST__) - __nv_bfloat16 bf16; - #endif - }; - int scalar = preMulScalar(rank); - switch(type) { - case ncclInt8: i8 = int8_t(scalar); break; - case ncclUint8: u8 = uint8_t(scalar); break; - case ncclInt32: i32 = int32_t(scalar); break; - case ncclUint32: u32 = uint32_t(scalar); break; - case ncclInt64: i64 = int32_t(scalar); break; - case ncclUint64: u64 = uint32_t(scalar); break; - case ncclFloat16: f16 = __float2half(float(scalar)); break; - case ncclFloat32: f32 = float(scalar); break; - case ncclFloat64: f64 = double(scalar); break; - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; - #endif - } - NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); - } - #endif - - TESTCHECK(args->collTest->runColl( - (void*)(in_place ? 
recvBuff + args->sendInplaceOffset*rank : sendBuff), - (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - if(opIndex >= ncclNumOps) { - NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); - } - #endif - } - if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); - - if (blocking_coll) { - // Complete op before returning - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); - } - if (blocking_coll) Barrier(args); - return testSuccess; -} - -testResult_t completeColl(struct threadArgs* args) { - if (blocking_coll) return testSuccess; - - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); - return testSuccess; -} - -testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { - size_t count = args->nbytes / wordSize(type); - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - } - - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); - - Barrier(args); - -#if CUDART_VERSION >= 11030 - cudaGraph_t graphs[args->nGpus]; - cudaGraphExec_t graphExec[args->nGpus]; - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture - for (int i=0; inGpus; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect - CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); - } - } -#endif - - // Performance Benchmark - auto start = std::chrono::high_resolution_clock::now(); - for (int iter = 0; iter < iters; iter++) { - if (agg_iters>1) NCCLCHECK(ncclGroupStart()); - for (int aiter = 0; aiter < agg_iters; aiter++) { - TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); - } - if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); - } - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // End cuda graph capture - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); - } - // Instantiate cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); - } - // Resync CPU, restart timing, launch cuda graph - Barrier(args); - start = std::chrono::high_resolution_clock::now(); - for (int l=0; lnGpus; i++) { - CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); - } - } - } -#endif - - TESTCHECK(completeColl(args)); - - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); - deltaSec = deltaSec/(iters*agg_iters); - if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; - Allreduce(args, &deltaSec, average); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - //destroy cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphExecDestroy(graphExec[i])); - CUDACHECK(cudaGraphDestroy(graphs[i])); - } - } -#endif - - double algBw, busBw; - args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); - - Barrier(args); - - double maxDelta = 0; - static __thread int rep = 0; - rep++; - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture for data check 
- for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); - } - } -#endif - - //test validation in single itertion, should ideally be included into the multi-iteration run - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // End cuda graph capture - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); - } - // Instantiate cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); - } - // Launch cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); - } - } -#endif - - TESTCHECK(completeColl(args)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - //destroy cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphExecDestroy(graphExec[i])); - CUDACHECK(cudaGraphDestroy(graphs[i])); - } - } -#endif - - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); - - //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); - } - - double timeUsec = deltaSec*1.0E6; - char timeStr[100]; - if (timeUsec >= 10000.0) { - sprintf(timeStr, "%7.0f", timeUsec); - } else if (timeUsec >= 100.0) { - sprintf(timeStr, "%7.1f", timeUsec); - } else { - sprintf(timeStr, "%7.2f", timeUsec); - } - if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); - } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); - } - - args->bw[0] += busBw; - args->bw_count[0]++; - return testSuccess; -} - -void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { - int nranks = args->nProcs*args->nGpus*args->nThreads; - size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; - - count = size / wordSize(type); - args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); - - args->nbytes = paramCount * wordSize(type); - args->sendBytes = sendCount * wordSize(type); - args->expectedBytes = recvCount * wordSize(type); - args->sendInplaceOffset = sendInplaceOffset * wordSize(type); - args->recvInplaceOffset = recvInplaceOffset * wordSize(type); -} - -testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { - // Warm-up for large size - setupArgs(args->maxbytes, type, args); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, type, op, root, 0, iter)); - } - TESTCHECK(completeColl(args)); - - // Warm-up for small size - setupArgs(args->minbytes, type, args); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, type, op, root, 0, iter)); - } - TESTCHECK(completeColl(args)); - - // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - TESTCHECK(BenchTime(args, type, op, root, 1)); - PRINT("\n"); - } - return testSuccess; -} - -testResult_t threadRunTests(struct threadArgs* args) { - // Set device to the first of our GPUs. 
If we don't do that, some operations - // will be done on the current GPU (by default : 0) and if the GPUs are in - // exclusive mode those operations will fail. - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; - CUDACHECK(cudaSetDevice(gpuid)); - TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); - return testSuccess; -} - -testResult_t threadInit(struct threadArgs* args) { - char hostname[1024]; - getHostName(hostname, 1024); - int nranks = args->nProcs*args->nThreads*args->nGpus; - - //set main thread again - is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; - - NCCLCHECK(ncclGroupStart()); - for (int i=0; inGpus; i++) { - int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); - NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); - } - NCCLCHECK(ncclGroupEnd()); - - TESTCHECK(threadRunTests(args)); - - for (int i=0; inGpus; i++) { - NCCLCHECK(ncclCommDestroy(args->comms[i])); - } - return testSuccess; -} - -void* threadLauncher(void* thread_) { - struct testThread* thread = (struct testThread*)thread_; - thread->ret = thread->func(&thread->args); - return NULL; -} -testResult_t threadLaunch(struct testThread* thread) { - pthread_create(&thread->thread, NULL, threadLauncher, thread); - return testSuccess; -} - -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { - CUDACHECK(cudaMalloc(sendbuff, nbytes)); - CUDACHECK(cudaMalloc(recvbuff, nbytes)); - if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); - return testSuccess; -} - -testResult_t run(); // Main function - -int main(int argc, char* argv[]) { - // Make sure everyline is flushed so that we see the progress of the test - setlinebuf(stdout); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - ncclGetVersion(&test_ncclVersion); - #else - test_ncclVersion = NCCL_VERSION_CODE; - #endif - //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) - test_opnum = 4; - test_typenum = 9; - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { - test_opnum++; // ncclAvg - #if defined(__CUDA_BF16_TYPES_EXIST__) - test_typenum++; // bfloat16 - #endif - } - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { - test_opnum++; // PreMulSum - } - #endif - - // Parse args - double parsed; - int longindex; - static struct option longopts[] = { - {"nthreads", required_argument, 0, 't'}, - {"ngpus", required_argument, 0, 'g'}, - {"minbytes", required_argument, 0, 'b'}, - {"maxbytes", required_argument, 0, 'e'}, - {"stepbytes", required_argument, 0, 'i'}, - {"stepfactor", required_argument, 0, 'f'}, - {"iters", required_argument, 0, 'n'}, - {"agg_iters", required_argument, 0, 'm'}, - {"warmup_iters", required_argument, 0, 'w'}, - {"parallel_init", required_argument, 0, 'p'}, - {"check", required_argument, 0, 'c'}, - {"op", required_argument, 0, 'o'}, - {"datatype", required_argument, 0, 'd'}, - {"root", required_argument, 0, 'r'}, - {"blocking", required_argument, 0, 'z'}, - {"cudagraph", required_argument, 0, 'G'}, - {"average", required_argument, 0, 'a'}, - {"help", no_argument, 0, 'h'}, 
- {} - }; - - while(1) { - int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); - - if (c == -1) - break; - - switch(c) { - case 't': - nThreads = strtol(optarg, NULL, 0); - break; - case 'g': - nGpus = strtol(optarg, NULL, 0); - break; - case 'b': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'minbytes'\n"); - return -1; - } - minBytes = (size_t)parsed; - break; - case 'e': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'maxbytes'\n"); - return -1; - } - maxBytes = (size_t)parsed; - break; - case 'i': - stepBytes = strtol(optarg, NULL, 0); - break; - case 'f': - stepFactor = strtol(optarg, NULL, 0); - break; - case 'n': - iters = (int)strtol(optarg, NULL, 0); - break; - case 'm': -#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) - agg_iters = (int)strtol(optarg, NULL, 0); -#else - fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); -#endif - break; - case 'w': - warmup_iters = (int)strtol(optarg, NULL, 0); - break; - case 'c': - datacheck = (int)strtol(optarg, NULL, 0); - break; - case 'p': - parallel_init = (int)strtol(optarg, NULL, 0); - break; - case 'o': - ncclop = ncclstringtoop(optarg); - break; - case 'd': - nccltype = ncclstringtotype(optarg); - break; - case 'r': - ncclroot = strtol(optarg, NULL, 0); - break; - case 'z': - blocking_coll = strtol(optarg, NULL, 0); - break; - case 'G': -#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 - cudaGraphLaunches = strtol(optarg, NULL, 0); -#else - printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); -#endif - break; - case 'a': - average = (int)strtol(optarg, NULL, 0); - break; - case 'h': - default: - if (c != 'h') printf("invalid option '%c'\n", c); - printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" - "[-g,--ngpus ] \n\t" - "[-b,--minbytes ] \n\t" - "[-e,--maxbytes ] \n\t" - "[-i,--stepbytes ] \n\t" - "[-f,--stepfactor ] \n\t" - "[-n,--iters ] \n\t" - "[-m,--agg_iters ] \n\t" - "[-w,--warmup_iters ] \n\t" - "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - "[-o,--op ] \n\t" -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - "[-o,--op ] \n\t" -#else - "[-o,--op ] \n\t" -#endif - "[-d,--datatype ] \n\t" - "[-r,--root ] \n\t" - "[-z,--blocking <0/1>] \n\t" - "[-G,--cudagraph ] \n\t" - "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" - "[-h,--help]\n", - basename(argv[0])); - return 0; - } - } - if (minBytes > maxBytes) { - fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", - (unsigned long long)minBytes, - (unsigned long long)maxBytes); - return -1; - } -#ifdef MPI_SUPPORT - MPI_Init(&argc, &argv); -#endif - TESTCHECK(run()); - return 0; -} - -testResult_t run() { - int nProcs = 1, proc = 0; - int localRank = 0; - char hostname[1024]; - getHostName(hostname, 1024); - -#ifdef MPI_SUPPORT - MPI_Comm_size(MPI_COMM_WORLD, &nProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &proc); - uint64_t hostHashs[nProcs]; - hostHashs[proc] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); - if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); - if 
(parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); - PRINT("#\n"); - - PRINT("# Using devices\n"); -#define MAX_LINE 2048 - char line[MAX_LINE]; - int len = 0; - size_t maxMem = ~0; - for (int i=0; i memMaxBytes) { - maxBytes = memMaxBytes; - if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); - } - - ncclUniqueId ncclId; - if (proc == 0) { - NCCLCHECK(ncclGetUniqueId(&ncclId)); - } -#ifdef MPI_SUPPORT - MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); -#endif - cudaStream_t streams[nGpus*nThreads]; - void* sendbuffs[nGpus*nThreads]; - void* recvbuffs[nGpus*nThreads]; - void* expected[nGpus*nThreads]; - size_t sendBytes, recvBytes; - - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); - - for (int i=0; i=0; t--) { - threads[t].args.minbytes=minBytes; - threads[t].args.maxbytes=maxBytes; - threads[t].args.stepbytes=stepBytes; - threads[t].args.stepfactor=stepFactor; - threads[t].args.localRank = localRank; - - threads[t].args.nProcs=nProcs; - threads[t].args.proc=proc; - threads[t].args.nThreads=nThreads; - threads[t].args.thread=t; - threads[t].args.nGpus=nGpus; - threads[t].args.sendbuffs = sendbuffs+t*nGpus; - threads[t].args.recvbuffs = recvbuffs+t*nGpus; - threads[t].args.expected = expected+t*nGpus; - threads[t].args.ncclId = ncclId; - threads[t].args.comms=comms+t*nGpus; - threads[t].args.streams=streams+t*nGpus; - - threads[t].args.barrier = (volatile int*)barrier; - threads[t].args.barrier_idx = 0; - threads[t].args.reduce = (volatile double*)reduce; - threads[t].args.sync = (volatile int*)sync; - threads[t].args.sync_idx = 0; - threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); - threads[t].args.errors=errors+t; - threads[t].args.bw=bw+t; - threads[t].args.bw_count=bw_count+t; - - threads[t].args.reportErrors = 1; - - threads[t].func = parallel_init ? 
threadInit : threadRunTests; - if (t) - TESTCHECK(threadLaunch(threads+t)); - else - TESTCHECK(threads[t].func(&threads[t].args)); - } - - // Wait for other threads and accumulate stats and errors - for (int t=nThreads-1; t>=0; t--) { - if (t) pthread_join(threads[t].thread, NULL); - TESTCHECK(threads[t].ret); - if (t) { - errors[0] += errors[t]; - bw[0] += bw[t]; - bw_count[0] += bw_count[t]; - } - } - -#ifdef MPI_SUPPORT - MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -#endif - - if (!parallel_init) { - for(int i=0; i -#include -#include -#include -#include "cuda.h" - -int test_ncclVersion = 0; // init'd with ncclGetVersion() - -#if NCCL_MAJOR >= 2 - ncclDataType_t test_types[ncclNumTypes] = { - ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclBfloat16 - #endif - }; - const char *test_typenames[ncclNumTypes] = { - "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , "bfloat16" - #endif - }; - int test_typenum = -1; - - const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclAvg - #endif - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand - #endif - }; - int test_opnum = -1; -#else - ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; - const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; - int test_typenum = 7; - const char *test_opnames[] = {"sum", "prod", "max", "min"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; - int test_opnum = 4; -#endif - -thread_local int is_main_thread = 0; - -// Command line parameter defaults -static int nThreads = 1; -static int nGpus = 1; -static size_t minBytes = 32*1024*1024; -static size_t maxBytes = 32*1024*1024; -static size_t stepBytes = 1*1024*1024; -static size_t stepFactor = 1; -static int datacheck = 1; -static int warmup_iters = 5; -static int iters = 20; -static int agg_iters = 1; -static int ncclop = ncclSum; -static int nccltype = ncclFloat; -static int ncclroot = 0; -static int parallel_init = 0; -static int blocking_coll = 0; -static int cudaGraphLaunches = 0; -// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) -static int average = 1; - -#define NUM_BLOCKS 32 - -static double parsesize(const char *value) { - long long int units; - double size; - char size_lit; - - int count = sscanf(value, "%lf %1s", &size, &size_lit); - - switch (count) { - case 2: - switch (size_lit) { - case 'G': - case 'g': - units = 1024*1024*1024; - break; - case 'M': - case 'm': - units = 1024*1024; - break; - case 'K': - case 'k': - units = 1024; - break; - default: - return -1.0; - }; - break; - case 1: - units = 1; - break; - default: - return -1.0; - } - - return size * units; -} - -double DeltaMaxValue(ncclDataType_t type) { - switch(type) { - case ncclHalf: return 1e-2; -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: return 1e-2; -#endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint8: - //case ncclInt32: - case ncclUint32: -#endif 
- case ncclInt64: - case ncclUint64: return 1e-200; - } - return 1e-200; -} - -template __device__ -double absDiff(T a, T b) { - return fabs((double)(b - a)); -} - -template<> __device__ -double absDiff(half a, half b) { - float x = __half2float(a); - float y = __half2float(b); - return fabs((double)(y-x)); -} - -template __device__ -float toFloat(T a) { - return (float)a; -} -template<> __device__ -float toFloat(half a) { - return __half2float(a); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> __device__ -float toFloat(__nv_bfloat16 a) { - return __bfloat162float(a); -} -#endif - -template __global__ -void deltaKern(void* A_, void* B_, size_t count, double* max) { - const T* A = (const T*)A_; - const T* B = (const T*)B_; - __shared__ double temp[BSIZE]; - int tid = blockIdx.x*blockDim.x + threadIdx.x; - double locmax = 0.0; - for(size_t i=tid; i locmax ) { - locmax = delta; -#ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); -#endif - } - } - - tid = threadIdx.x; - temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { - __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; - } - __syncthreads(); - if( threadIdx.x == 0) - max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; -} - -testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { - switch (type) { -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; -#endif - case ncclHalf: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclFloat: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclDouble: - deltaKern<<>>(results, expected, count, devmax); break; - - case ncclChar: -#if NCCL_MAJOR >= 2 - case ncclUint8: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint32: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt64: - case ncclUint64: - deltaKern<<>>(results, expected, count, devmax); break; - } - CUDACHECK(cudaDeviceSynchronize()); - for (int i=1; i -__device__ T testValue(const size_t offset, const int rep, const int rank) { - uint8_t v = (rep+rank+offset) % 256; - return (T)v; -} - -// For floating point datatype, we use values between 0 and 1 otherwise the -// Product operation will produce NaNs. -template<> -__device__ double testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(double)testValue(offset, rep, rank)); -} -template<> -__device__ float testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(float)testValue(offset, rep, rank)); -} -template<> -__device__ half testValue(const size_t offset, const int rep, const int rank) { - return __float2half(testValue(offset, rep, rank)); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { - return __float2bfloat16(testValue(offset, rep, rank)); -} -#endif - -// Operations -template -__device__ T ncclOpSum(T a, T b) { return a+b; } -template -__device__ T ncclOpProd(T a, T b) { return a*b; } -template -__device__ T ncclOpMax(T a, T b) { return a>b ? 
a : b; } -template -__device__ T ncclOpMin(T a, T b) { return a -__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } -template<> -__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } -template<> -__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } -template<> -__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; } - -template -__device__ T ncclPPOpIdent(T x, int arg) { return x; } -template -__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } -template -__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } -template<> -__device__ half ncclPPOpMul(half x, int arg) { - return __float2half(__half2float(x)*float(arg)); -} -template<> -__device__ half ncclPPOpDiv(half x, int n) { - return __float2half(__half2float(x)/n); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { - return __float2bfloat16(__bfloat162float(x)*float(arg)); -} -template<> -__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { - return __float2bfloat16(__bfloat162float(x)/n); -} -#endif - -__host__ __device__ int preMulScalar(int rank) { - return 1 + rank%2; -} - -template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); - val = PreOp(val, preMulScalar(0)); - for (int i=1; i(o+offset, rep, i); - val1 = PreOp(val1, preMulScalar(i)); - val = Op(val, val1); - } - data[o] = PostOp(val, nranks); - } -} - -#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ - KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) -#else - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) -#endif - -static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - OPS(__nv_bfloat16) -#endif -}; - -testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, 
cudaStreamDefault)); - return testSuccess; -} - -template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); -} - -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - (void*)InitDataKernel< float>, - (void*)InitDataKernel< double>, -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - (void*)InitDataKernel<__nv_bfloat16> -#endif -}; - -template -testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { - T* ptr = (T*)dest; - InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); - return testSuccess; -} - -testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); - return testSuccess; -} - -void Barrier(struct threadArgs* args) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - MPI_Barrier(MPI_COMM_WORLD); -#endif - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); - } - args->barrier_idx=!args->barrier_idx; -} - -// Inter-thread/process barrier+allreduce -void Allreduce(struct threadArgs* args, double* value, int average) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - double val = *value; - if (args->thread > 0) { - double val2 = args->reduce[args->barrier_idx]; - if (average == 1) val += val2; - if (average == 2) val = std::min(val, val2); - if (average == 3) val = std::max(val, val2); - } - if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - if (average != 0) { - MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; - MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); - } -#endif - if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; - args->reduce[1-args->barrier_idx] = 0; - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); - } - *value = args->reduce[args->barrier_idx]; - args->barrier_idx=!args->barrier_idx; -} - -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { - size_t count = args->expectedBytes/wordSize(type); - double maxDelta = 0.0; - for (int i=0; inGpus; i++) { - int device; - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); - void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); - maxDelta = std::max(*(args->deltaHost), maxDelta); - -#ifdef DEBUG_PRINT - if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); - - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Expected: "); - for(int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - printf("\n"); - - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Actual: "); - for (int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); - } -#endif - } - double nranks = args->nProcs*args->nThreads*args->nGpus; - if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; - *delta = maxDelta; - return testSuccess; -} - -testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { - cudaError_t cudaErr; - int remaining = ngpus; - int* done = (int*)malloc(sizeof(int)*ngpus); - memset(done, 0, sizeof(int)*ngpus); - while (remaining) { - int idle = 1; - for (int i=0; i= NCCL_VERSION(2,4,0) - if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { - ncclResult_t ncclAsyncErr; - NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); - if (ncclAsyncErr != ncclSuccess) { - // An asynchronous error happened. Stop the operation and destroy - // the communicator - for (int i=0; inbytes / wordSize(type); - - // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange - size_t totalnbytes = max(args->sendBytes, args->expectedBytes); - size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; - size_t shift = totalnbytes * (iter % steps); - - if (args->nGpus > 1) { - // printf("startColl, args->nGpus > 1 run ncclGroupStart\n"); - NCCLCHECK(ncclGroupStart()); - } - for (int i = 0; i < args->nGpus; i++) { -#ifndef NCCL_MAJOR - int cudaDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); - CUDACHECK(cudaSetDevice(cudaDev)); -#endif - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - char* recvBuff = ((char*)args->recvbuffs[i]) + shift; - char* sendBuff = ((char*)args->sendbuffs[i]) + shift; - ncclRedOp_t op; - - if(opIndex < ncclNumOps) { - op = opIndex; - } - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - else { - union { - int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; - half f16; float f32; double f64; - #if defined(__CUDA_BF16_TYPES_EXIST__) - __nv_bfloat16 bf16; - #endif - }; - int scalar = preMulScalar(rank); - switch(type) { - case ncclInt8: i8 = int8_t(scalar); break; - case ncclUint8: u8 = uint8_t(scalar); break; - case ncclInt32: i32 = int32_t(scalar); break; - case ncclUint32: u32 = uint32_t(scalar); break; - case ncclInt64: i64 = int32_t(scalar); break; - case ncclUint64: u64 = uint32_t(scalar); break; - case ncclFloat16: f16 = __float2half(float(scalar)); break; - case ncclFloat32: f32 = float(scalar); break; - case ncclFloat64: f64 = double(scalar); break; - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; - #endif - } - NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); - } - #endif - - TESTCHECK(args->collTest->runColl( - (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), - (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - if(opIndex >= ncclNumOps) { - NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); - } - #endif - } - if (args->nGpus > 1) { - // printf("startColl, args->nGpus > 1 run ncclGroupEnd\n"); - NCCLCHECK(ncclGroupEnd()); - } - - if (blocking_coll) { - // Complete op before returning - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); - } - if (blocking_coll) Barrier(args); - return testSuccess; -} - -testResult_t completeColl(struct threadArgs* args) { - if (blocking_coll) return testSuccess; - - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); - return testSuccess; -} - -testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { - size_t count = args->nbytes / wordSize(type); - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - } - - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); - - Barrier(args); - -#if CUDART_VERSION >= 11030 - cudaGraph_t graphs[args->nGpus]; - cudaGraphExec_t graphExec[args->nGpus]; - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture - for (int i=0; inGpus; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect - CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); - } - } -#endif - - // Performance Benchmark - auto start = std::chrono::high_resolution_clock::now(); - for (int iter = 0; iter < iters; iter++) { - if (agg_iters>1) NCCLCHECK(ncclGroupStart()); 
- for (int aiter = 0; aiter < agg_iters; aiter++) { - TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); - } - if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); - } - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // End cuda graph capture -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); -// } -// // Instantiate cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); -// } -// // Resync CPU, restart timing, launch cuda graph -// Barrier(args); -// start = std::chrono::high_resolution_clock::now(); -// for (int l=0; lnGpus; i++) { -// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); -// } -// } -// } -// #endif - - TESTCHECK(completeColl(args)); - - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); - deltaSec = deltaSec/(iters*agg_iters); - if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; - Allreduce(args, &deltaSec, average); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// //destroy cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); -// CUDACHECK(cudaGraphDestroy(graphs[i])); -// } -// } -// #endif - - double algBw, busBw; - args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); - - Barrier(args); - - double maxDelta = 0; - static __thread int rep = 0; - rep++; - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); - - //test validation in single itertion, should ideally be included into the multi-iteration run - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - - TESTCHECK(completeColl(args)); - - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); - - //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); - } - - double timeUsec = deltaSec*1.0E6; - char timeStr[100]; - if (timeUsec >= 10000.0) { - sprintf(timeStr, "%7.0f", timeUsec); - } else if (timeUsec >= 100.0) { - sprintf(timeStr, "%7.1f", timeUsec); - } else { - sprintf(timeStr, "%7.2f", timeUsec); - } - if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); - } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); - } - - args->bw[0] += busBw; - args->bw_count[0]++; - return testSuccess; -} - -void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { - int nranks = args->nProcs*args->nGpus*args->nThreads; - size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; - - count = size / wordSize(type); - args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); - - args->nbytes = paramCount * wordSize(type); - args->sendBytes = sendCount * wordSize(type); - args->expectedBytes = recvCount * wordSize(type); - args->sendInplaceOffset = sendInplaceOffset * wordSize(type); - args->recvInplaceOffset = recvInplaceOffset * wordSize(type); -} - -testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { - // // Warm-up for large size - // setupArgs(args->maxbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // TESTCHECK(startColl(args, type, op, root, 0, 
iter)); - // } - // TESTCHECK(completeColl(args)); - - // // Warm-up for small size - // setupArgs(args->minbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // TESTCHECK(startColl(args, type, op, root, 0, iter)); - // } - // TESTCHECK(completeColl(args)); - - // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - // TESTCHECK(BenchTime(args, type, op, root, 1)); - PRINT("\n"); - } - return testSuccess; -} - -testResult_t threadRunTests(struct threadArgs* args) { - // Set device to the first of our GPUs. If we don't do that, some operations - // will be done on the current GPU (by default : 0) and if the GPUs are in - // exclusive mode those operations will fail. - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; - CUDACHECK(cudaSetDevice(gpuid)); - TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); - return testSuccess; -} - -testResult_t threadInit(struct threadArgs* args) { - char hostname[1024]; - getHostName(hostname, 1024); - int nranks = args->nProcs*args->nThreads*args->nGpus; - - //set main thread again - is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; - - NCCLCHECK(ncclGroupStart()); - for (int i=0; inGpus; i++) { - int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); - NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); - } - NCCLCHECK(ncclGroupEnd()); - - TESTCHECK(threadRunTests(args)); - - for (int i=0; inGpus; i++) { - NCCLCHECK(ncclCommDestroy(args->comms[i])); - } - return testSuccess; -} - -void* threadLauncher(void* thread_) { - struct testThread* thread = (struct testThread*)thread_; - thread->ret = thread->func(&thread->args); - return NULL; -} -testResult_t threadLaunch(struct testThread* thread) { - pthread_create(&thread->thread, NULL, threadLauncher, thread); - return testSuccess; -} - -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { - CUDACHECK(cudaMalloc(sendbuff, nbytes)); - CUDACHECK(cudaMalloc(recvbuff, nbytes)); - if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); - return testSuccess; -} - -testResult_t run(); // Main function - -int main(int argc, char* argv[]) { - // Make sure everyline is flushed so that we see the progress of the test - setlinebuf(stdout); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - ncclGetVersion(&test_ncclVersion); - #else - test_ncclVersion = NCCL_VERSION_CODE; - #endif - //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) - test_opnum = 4; - test_typenum = 9; - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { - test_opnum++; // ncclAvg - #if defined(__CUDA_BF16_TYPES_EXIST__) - test_typenum++; // bfloat16 - #endif - } - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { - test_opnum++; // PreMulSum - } - #endif - - // 
Parse args - double parsed; - int longindex; - static struct option longopts[] = { - {"nthreads", required_argument, 0, 't'}, - {"ngpus", required_argument, 0, 'g'}, - {"minbytes", required_argument, 0, 'b'}, - {"maxbytes", required_argument, 0, 'e'}, - {"stepbytes", required_argument, 0, 'i'}, - {"stepfactor", required_argument, 0, 'f'}, - {"iters", required_argument, 0, 'n'}, - {"agg_iters", required_argument, 0, 'm'}, - {"warmup_iters", required_argument, 0, 'w'}, - {"parallel_init", required_argument, 0, 'p'}, - {"check", required_argument, 0, 'c'}, - {"op", required_argument, 0, 'o'}, - {"datatype", required_argument, 0, 'd'}, - {"root", required_argument, 0, 'r'}, - {"blocking", required_argument, 0, 'z'}, - {"cudagraph", required_argument, 0, 'G'}, - {"average", required_argument, 0, 'a'}, - {"help", no_argument, 0, 'h'}, - {} - }; - - while(1) { - int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); - - if (c == -1) - break; - - switch(c) { - case 't': - nThreads = strtol(optarg, NULL, 0); - break; - case 'g': - nGpus = strtol(optarg, NULL, 0); - break; - case 'b': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'minbytes'\n"); - return -1; - } - minBytes = (size_t)parsed; - break; - case 'e': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'maxbytes'\n"); - return -1; - } - maxBytes = (size_t)parsed; - break; - case 'i': - stepBytes = strtol(optarg, NULL, 0); - break; - case 'f': - stepFactor = strtol(optarg, NULL, 0); - break; - case 'n': - iters = (int)strtol(optarg, NULL, 0); - break; - case 'm': -#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) - agg_iters = (int)strtol(optarg, NULL, 0); -#else - fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); -#endif - break; - case 'w': - warmup_iters = (int)strtol(optarg, NULL, 0); - break; - case 'c': - datacheck = (int)strtol(optarg, NULL, 0); - break; - case 'p': - parallel_init = (int)strtol(optarg, NULL, 0); - break; - case 'o': - ncclop = ncclstringtoop(optarg); - break; - case 'd': - nccltype = ncclstringtotype(optarg); - break; - case 'r': - ncclroot = strtol(optarg, NULL, 0); - break; - case 'z': - blocking_coll = strtol(optarg, NULL, 0); - break; - case 'G': -#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 - cudaGraphLaunches = strtol(optarg, NULL, 0); -#else - printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. 
Ignoring\n"); -#endif - break; - case 'a': - average = (int)strtol(optarg, NULL, 0); - break; - case 'h': - default: - if (c != 'h') printf("invalid option '%c'\n", c); - printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" - "[-g,--ngpus ] \n\t" - "[-b,--minbytes ] \n\t" - "[-e,--maxbytes ] \n\t" - "[-i,--stepbytes ] \n\t" - "[-f,--stepfactor ] \n\t" - "[-n,--iters ] \n\t" - "[-m,--agg_iters ] \n\t" - "[-w,--warmup_iters ] \n\t" - "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - "[-o,--op ] \n\t" -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - "[-o,--op ] \n\t" -#else - "[-o,--op ] \n\t" -#endif - "[-d,--datatype ] \n\t" - "[-r,--root ] \n\t" - "[-z,--blocking <0/1>] \n\t" - "[-G,--cudagraph ] \n\t" - "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" - "[-h,--help]\n", - basename(argv[0])); - return 0; - } - } - if (minBytes > maxBytes) { - fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", - (unsigned long long)minBytes, - (unsigned long long)maxBytes); - return -1; - } -#ifdef MPI_SUPPORT - MPI_Init(&argc, &argv); -#endif - TESTCHECK(run()); - return 0; -} - -testResult_t run() { - int nProcs = 1, proc = 0; - int localRank = 0; - char hostname[1024]; - getHostName(hostname, 1024); - -#ifdef MPI_SUPPORT - MPI_Comm_size(MPI_COMM_WORLD, &nProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &proc); - uint64_t hostHashs[nProcs]; - hostHashs[proc] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); - if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); - if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); - PRINT("#\n"); - - PRINT("# Using devices\n"); -#define MAX_LINE 2048 - char line[MAX_LINE]; - int len = 0; - size_t maxMem = ~0; - for (int i=0; i memMaxBytes) { - maxBytes = memMaxBytes; - if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); - } - - ncclUniqueId ncclId; - if (proc == 0) { - NCCLCHECK(ncclGetUniqueId(&ncclId)); - } -#ifdef MPI_SUPPORT - MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); -#endif - cudaStream_t streams[nGpus*nThreads]; - void* sendbuffs[nGpus*nThreads]; - void* recvbuffs[nGpus*nThreads]; - void* expected[nGpus*nThreads]; - size_t sendBytes, recvBytes; - - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); - - for (int i=0; i=0; t--) { - threads[t].args.minbytes=minBytes; - threads[t].args.maxbytes=maxBytes; - threads[t].args.stepbytes=stepBytes; - threads[t].args.stepfactor=stepFactor; - threads[t].args.localRank = localRank; - - threads[t].args.nProcs=nProcs; - threads[t].args.proc=proc; - threads[t].args.nThreads=nThreads; - threads[t].args.thread=t; - threads[t].args.nGpus=nGpus; - threads[t].args.sendbuffs = sendbuffs+t*nGpus; - threads[t].args.recvbuffs = recvbuffs+t*nGpus; - threads[t].args.expected = expected+t*nGpus; - threads[t].args.ncclId = ncclId; - threads[t].args.comms=comms+t*nGpus; - threads[t].args.streams=streams+t*nGpus; - - threads[t].args.barrier = (volatile int*)barrier; - threads[t].args.barrier_idx = 0; - threads[t].args.reduce = (volatile double*)reduce; - threads[t].args.sync = (volatile 
int*)sync; - threads[t].args.sync_idx = 0; - threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); - threads[t].args.errors=errors+t; - threads[t].args.bw=bw+t; - threads[t].args.bw_count=bw_count+t; - - threads[t].args.reportErrors = 1; - - threads[t].func = parallel_init ? threadInit : threadRunTests; - if (t) - TESTCHECK(threadLaunch(threads+t)); - else - TESTCHECK(threads[t].func(&threads[t].args)); - } - - // Wait for other threads and accumulate stats and errors - for (int t=nThreads-1; t>=0; t--) { - if (t) pthread_join(threads[t].thread, NULL); - TESTCHECK(threads[t].ret); - if (t) { - errors[0] += errors[t]; - bw[0] += bw[t]; - bw_count[0] += bw_count[t]; - } - } - -#ifdef MPI_SUPPORT - MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -#endif - - if (!parallel_init) { - for(int i=0; i Date: Fri, 9 Sep 2022 08:13:28 +0000 Subject: [PATCH 026/109] wierd check --- src/Makefile | 40 ++++++++++++------ src/common.cu | 2 +- src_simple/Makefile | 40 ++++++++++++------ src_simple/common_simple.cu | 74 ++++++++++++++++++++-------------- src_simple/ofccl_all_reduce.cu | 21 +++++++++- 5 files changed, 118 insertions(+), 59 deletions(-) diff --git a/src/Makefile b/src/Makefile index 2a399db..8cee9d8 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 0 +DEBUG ?= 1 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -19,20 +19,32 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) # Better define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. -ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) -NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_80,code=sm_80 \ - -gencode=arch=compute_80,code=compute_80 +# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) else -NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_70,code=compute_70 +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 @@ -72,6 +84,8 @@ endif LIBRARIES += nccl NVLDFLAGS += $(LIBRARIES:%=-l%) +$(info CARDNAME $(NVCUFLAGS)) + DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) diff --git a/src/common.cu b/src/common.cu index 72857cd..939e777 100644 --- a/src/common.cu +++ b/src/common.cu @@ -590,7 +590,7 @@ testResult_t completeColl(struct threadArgs* args) { testResult_t 
BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); - if (datacheck) { + if (datacheck) { // 这里的目的应该是让测带宽跑的coll也使用非0数据。 // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); } diff --git a/src_simple/Makefile b/src_simple/Makefile index de282de..ccad131 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 0 +DEBUG ?= 1 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -19,20 +19,32 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) # Better define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. -ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) -NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_80,code=sm_80 \ - -gencode=arch=compute_80,code=compute_80 +# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) else -NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_70,code=compute_70 +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 @@ -72,6 +84,8 @@ endif LIBRARIES += nccl NVLDFLAGS += $(LIBRARIES:%=-l%) +$(info CARDNAME $(NVCUFLAGS)) + DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 063664d..c01218a 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -216,7 +216,7 @@ __global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { } testResult_t CheckDelta(void *results, void *expected, size_t count, - ncclDataType_t type, double *devmax) { + ncclDataType_t type, double *devmax, cudaStream_t stream) { switch (type) { #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: @@ -281,7 +281,7 @@ __device__ float testValue(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - return 1.0; + return 0.25; } template <> __device__ half testValue(const size_t offset, const int rep, @@ -437,8 +437,7 @@ testResult_t InitData(void *data, const size_t count, ncclDataType_t type, dim3 grid = {32, 1, 1}; dim3 block = {256, 1, 1}; void *args[4] = {(void *)&data, (void *)&count, 
(void *)&rep, (void *)&rank}; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, - cudaStreamDefault)); + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); return testSuccess; } @@ -496,7 +495,7 @@ void Allreduce(struct threadArgs *args, double *value, int average) { } testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, - ncclRedOp_t op, int root, int in_place, double *delta) { + ncclRedOp_t op, int root, int in_place, double *delta, cudaStream_t stream) { // 不要在默认stream上跑。 size_t count = args->expectedBytes / wordSize(type); double maxDelta = 0.0; for (int i = 0; i < args->nGpus; i++) { @@ -508,7 +507,7 @@ testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, args->recvInplaceOffset * rank)) : args->recvbuffs[i]; TESTCHECK( - CheckDelta(data, args->expected[i], count, type, args->deltaHost)); + CheckDelta(data, args->expected[i], count, type, args->deltaHost, stream)); maxDelta = std::max(*(args->deltaHost), maxDelta); #ifdef DEBUG_PRINT @@ -516,15 +515,15 @@ testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, int *expectedHost = (int *)malloc(args->expectedBytes); int *dataHost = (int *)malloc(args->expectedBytes); - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, - cudaMemcpyDeviceToHost); + cudaMemcpyAsync(expectedHost, args->expected[0], args->expectedBytes, + cudaMemcpyDeviceToHost, stream); printf("\n Expected: "); for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { printf("%d:%d ", j, expectedHost[j]); } printf("\n"); - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + cudaMemcpyAsync(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost, stream); printf("\n Actual: "); for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { printf("%d:%d ", j, dataHost[j]); @@ -818,15 +817,16 @@ testResult_t completeColl(struct threadArgs *args) { testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - } + // if (datacheck) { + // // Initialize sendbuffs, recvbuffs and expected + // TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + // } Barrier(args); // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); + // TODO: 这里要支持多轮,好像也没有很复杂。 for (int iter = 0; iter < iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { TESTCHECK(startColl(args, type, op, root, in_place, @@ -851,33 +851,35 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t Barrier(args); double maxDelta = 0; - static __thread int rep = 0; - rep++; // IF_CHECK 如果要检查对错,把下边露出来 - int printNum = 10; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - float *ptr = (float *)malloc(printNum * sizeof(float)); - cudaMemcpy(ptr, args->recvbuffs[0], printNum * sizeof(float), cudaMemcpyDeviceToHost); - for (int i = 0; i < printNum; i++) { - OFTEST_LOG(TEST, "<%lu> rank=%d, recvbuff[%d]=%f", pthread_self(), cudaDev, i, ptr[i]); - } - free(ptr); + // int printNum = 10; + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // float *ptr = (float *)malloc(printNum * sizeof(float)); + // cudaMemcpy(ptr, args->recvbuffs[0], printNum * sizeof(float), cudaMemcpyDeviceToHost); + // for (int i = 0; i < printNum; i++) { + // OFTEST_LOG(TEST, 
"<%lu> rank=%d, recvbuff[%d]=%f", pthread_self(), cudaDev, i, ptr[i]); + // } + // free(ptr); if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); //test validation in single itertion, should ideally be included into the multi-iteration run - TESTCHECK(startColl(args, type, op, root, in_place, 0, 0)); + // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0)); // will set cbArgList[0].gotCqe = 0 + + // // // TESTCHECK(completeColl(args)); + // pthread_mutex_lock(&cbArgList[0].mutex); + // while (cbArgList[0].gotCqe == 0) { - TESTCHECK(completeColl(args)); + // } + // pthread_mutex_unlock(&cbArgList[0].mutex); + - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + // TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta, args->streams[0])); - //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); + // //aggregate delta from all threads and procs + // Allreduce(args, &maxDelta, 3); } double timeUsec = deltaSec * 1.0E6; @@ -932,6 +934,16 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, } } + // 在这里完成check数据的准备; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + } + ofcclPrepareDone(); } diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 2b336d4..b4af9bc 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -36,16 +36,34 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + + // OFTEST_LOG(TEST, "<%lu> rank=%d, AllReduceInitData get gpuid=%d", pthread_self(), cudaDev, gpuid); + CUDACHECK(cudaSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + // // OFTEST_LOG(TEST, "<%lu> rank=%d, AllReduceInitData get int rank=%d", pthread_self(), cudaDev, rank); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + // // OFTEST_LOG(TEST, "<%lu> rank=%d, done cudaMemset", pthread_self(), cudaDev); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, type, rep, rank)); + // OFTEST_LOG(TEST, "<%lu> rank=%d, done InitData", pthread_self(), cudaDev); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + // // OFTEST_LOG(TEST, "<%lu> rank=%d, done InitDataReduce", pthread_self(), cudaDev); + CUDACHECK(cudaDeviceSynchronize()); + + // OFTEST_LOG(TEST, "<%lu> rank=%d, done cudaDeviceSynchronize", pthread_self(), cudaDev); + } + OFTEST_LOG(TEST, "<%lu> rank=%d, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -70,7 +88,7 @@ int myCallback(int collIdFromCqe, void *args) { pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); return 0; } @@ -85,6 +103,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args)); // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; } From 732f8bdf4553aac21711e5d8307d3cad57e49b7e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 9 Sep 2022 09:05:40 +0000 Subject: [PATCH 027/109] activate -n, can run multi-iters --- src_simple/common_simple.cu | 18 +++++++++++------- src_simple/ofccl_all_reduce.cu | 20 +++----------------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index c01218a..bd3cb5e 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -792,8 +792,6 @@ testResult_t completeColl(struct threadArgs *args) { if (blocking_coll) return testSuccess; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); int gotCqeCnt = 0; while (gotCqeCnt < multi_iters) { @@ -803,14 +801,18 @@ testResult_t completeColl(struct threadArgs *args) { if (seenCqe[i] == 0) { gotCqeCnt++; seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // if (cudaDev == 0) { + // OFTEST_LOG(TEST, "<%lu> rank=%d, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // } + } } pthread_mutex_unlock(&cbArgList[i].mutex); } - // OFTEST_LOG(TEST, "<%lu> rank=%d, completeColl gotCqeCnt = %d", pthread_self(), cudaDev, gotCqeCnt); } - - // TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); return testSuccess; } @@ -828,13 +830,15 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t auto start = std::chrono::high_resolution_clock::now(); // TODO: 这里要支持多轮,好像也没有很复杂。 for (int iter = 0; iter < iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter)); } - } - TESTCHECK(completeColl(args)); + TESTCHECK(completeColl(args)); + } auto delta = std::chrono::high_resolution_clock::now() - start; double deltaSec = diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index b4af9bc..b7169f9 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -41,29 
+41,15 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - - // OFTEST_LOG(TEST, "<%lu> rank=%d, AllReduceInitData get gpuid=%d", pthread_self(), cudaDev, gpuid); - CUDACHECK(cudaSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - // // OFTEST_LOG(TEST, "<%lu> rank=%d, AllReduceInitData get int rank=%d", pthread_self(), cudaDev, rank); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); - // // OFTEST_LOG(TEST, "<%lu> rank=%d, done cudaMemset", pthread_self(), cudaDev); - void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, type, rep, rank)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, done InitData", pthread_self(), cudaDev); - TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); - // // OFTEST_LOG(TEST, "<%lu> rank=%d, done InitDataReduce", pthread_self(), cudaDev); - CUDACHECK(cudaDeviceSynchronize()); - - // OFTEST_LOG(TEST, "<%lu> rank=%d, done cudaDeviceSynchronize", pthread_self(), cudaDev); - } - OFTEST_LOG(TEST, "<%lu> rank=%d, done AllReduceInitData", pthread_self(), cudaDev); + // OFTEST_LOG(TEST, "<%lu> rank=%d, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -88,7 +74,7 @@ int myCallback(int collIdFromCqe, void *args) { pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); return 0; } @@ -103,7 +89,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args)); // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); - OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; } From 85d5cbd40afa8fe354c57f791c206f9b8583e751 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 26 Sep 2022 02:48:47 +0000 Subject: [PATCH 028/109] + warmup --- src_simple/common_simple.cu | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index bd3cb5e..275f68a 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -828,7 +828,6 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); - // TODO: 这里要支持多轮,好像也没有很复杂。 for (int iter = 0; iter < iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { @@ -952,26 +951,16 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, } // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 - // TODO: 同时如果要warmup的话,也要准备相应的callbackArgs。比较麻烦;可以考虑对比实验的时候,nccl和ofccl都不开warmup。 - // Warm-up for large size - // setupArgs(args->maxbytes, type, args); - // for (int iter = 0; iter < 
warmup_iters; iter++) { - // for (int miter = 0; miter < multi_iters; miter++) { - // TESTCHECK(startColl(args, type, op, root, 0, - // iter * multi_iters + miter, miter)); - // } - // } - // TESTCHECK(completeColl(args)); - - // // Warm-up for small size - // setupArgs(args->minbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // for (int miter = 0; miter < multi_iters; miter++) { - // TESTCHECK(startColl(args, type, op, root, 0, - // iter * multi_iters + miter, miter)); - // } - // } - // TESTCHECK(completeColl(args)); + // warmup还是需要开,不然ofccl性能拉胯。 + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter)); + } + } + TESTCHECK(completeColl(args)); + // Benchmark for (size_t size = args->minbytes; size <= args->maxbytes; @@ -1415,6 +1404,11 @@ testResult_t run() { for (int t = nThreads - 1; t >= 0; t--) { threads[t].args.minbytes = minBytes; threads[t].args.maxbytes = maxBytes; + // TODO: 不支持多个size。 + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } threads[t].args.stepbytes = stepBytes; threads[t].args.stepfactor = stepFactor; threads[t].args.localRank = localRank; From 090185c0feaa996d2d83a74e4b36f3adb7170b8d Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 26 Sep 2022 07:55:32 +0000 Subject: [PATCH 029/109] fix completeColl in warmup --- src_simple/common_simple.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 275f68a..1f3e97b 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -954,13 +954,12 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // warmup还是需要开,不然ofccl性能拉胯。 setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { - for (int miter = 0; miter < multi_iters; miter++) { - TESTCHECK(startColl(args, type, op, root, 0, - iter * multi_iters + miter, miter)); - } + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter)); + } + TESTCHECK(completeColl(args)); } - TESTCHECK(completeColl(args)); - // Benchmark for (size_t size = args->minbytes; size <= args->maxbytes; From 17197fab885c1974f6cb9758ac6cf2955ea6db05 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 30 Sep 2022 07:16:45 +0000 Subject: [PATCH 030/109] try context --- src_simple/common_simple.cu | 76 ++++++++++++++++++---------------- src_simple/common_simple.h | 4 +- src_simple/ofccl_all_reduce.cu | 8 ++-- 3 files changed, 47 insertions(+), 41 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 1f3e97b..d748ab8 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -589,7 +589,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, } testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, - ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { size_t count = args->nbytes / wordSize(type); if (args->nGpus != 1) { OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); @@ -664,7 +664,7 @@ testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, &op, &u64, type, 
ncclScalarHostImmediate, comm)); } #endif - TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter)); + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { @@ -677,7 +677,7 @@ testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, } testResult_t startColl(struct threadArgs *args, ncclDataType_t type, - ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { size_t count = args->nbytes / wordSize(type); // Try to change offset for each iteration so that we avoid cache effects and @@ -766,7 +766,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff), (void *)(in_place ? recvBuff + args->recvInplaceOffset * rank - : recvBuff), miter, cbArgList + miter)); + : recvBuff), miter, cbArgList + miter, rankCtx)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { @@ -816,7 +816,7 @@ testResult_t completeColl(struct threadArgs *args) { return testSuccess; } -testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { size_t count = args->nbytes / wordSize(type); // if (datacheck) { @@ -833,7 +833,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t for (int miter = 0; miter < multi_iters; miter++) { seenCqe[miter] = 0; TESTCHECK(startColl(args, type, op, root, in_place, - iter * multi_iters + miter, miter)); + iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); @@ -869,7 +869,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t if (datacheck) { //test validation in single itertion, should ideally be included into the multi-iteration run - // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0)); // will set cbArgList[0].gotCqe = 0 + // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0, rankCtx)); // will set cbArgList[0].gotCqe = 0 // // // TESTCHECK(completeColl(args)); // pthread_mutex_lock(&cbArgList[0].mutex); @@ -925,30 +925,36 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, const char *typeName, ncclRedOp_t op, const char *opName, int root, bool is_ofccl) { - if (is_ofccl) { - // prepare for all size. op, type traversed in the caller. - // TODO: if we support multi size, each size should use a separate ncclComm - for (size_t size = args->minbytes; size <= args->maxbytes; - size = ((args->stepfactor > 1) ? size * args->stepfactor - : size + args->stepbytes)) { - setupArgs(size, type, args); - for (int miter = 0; miter < multi_iters; miter++) { - TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter)); - } + // if (is_ofccl) { + // 首先创建ofcclRankCtx_t + int thrdCudaDev; + CUDACHECK(cudaGetDevice(&thrdCudaDev)); + ofcclRankCtx_t rankCtx; + ofcclInitRankCtx(&rankCtx, thrdCudaDev); + + // prepare for all size. op, type traversed in the caller. + // TODO: if we support multi size, each size should use a separate ncclComm + for (size_t size = args->minbytes; size <= args->maxbytes; + size = ((args->stepfactor > 1) ? 
size * args->stepfactor + : size + args->stepbytes)) { + setupArgs(size, type, args); + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx)); } + } - // 在这里完成check数据的准备; - static __thread int rep = 0; - rep++; - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - } - - ofcclPrepareDone(); + // 在这里完成check数据的准备; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); } + + ofcclPrepareDone(rankCtx); + // } // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 // warmup还是需要开,不然ofccl性能拉胯。 @@ -956,7 +962,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, for (int iter = 0; iter < warmup_iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { TESTCHECK(startColl(args, type, op, root, 0, - iter * multi_iters + miter, miter)); + iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); } @@ -968,15 +974,15 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - // TESTCHECK(BenchTime(args, type, op, root, 1)); + TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); PRINT("\n"); } - if (is_ofccl) { - // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); - ofcclDestroy(); - } + // if (is_ofccl) { + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); + ofcclDestroy(rankCtx); + // } return testSuccess; } diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index bf2d0fd..dc75f47 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -81,8 +81,8 @@ struct testColl { testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); - testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args); - testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index b7169f9..fa69a13 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -78,7 +78,7 @@ int myCallback(int collIdFromCqe, void *args) { return 0; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); @@ -87,16 +87,16 @@ testResult_t 
AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa args->gotCqe = 0; pthread_mutex_init(&args->mutex, NULL); - NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args)); + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; } -testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId) { +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { - NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId)); + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } From 5f399fd4f4ef94d23ee038b08a41ce8965815a62 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 2 Oct 2022 14:22:20 +0000 Subject: [PATCH 031/109] bugfix: seenCqe[miter] = 0; in warmup --- src_simple/common_simple.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index d748ab8..dea54ce 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -961,6 +961,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; TESTCHECK(startColl(args, type, op, root, 0, iter * multi_iters + miter, miter, rankCtx)); } From 5cd2cb8a0d702c3f1482691208c5fd3f03658f47 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 6 Oct 2022 15:03:41 +0000 Subject: [PATCH 032/109] polish callback --- src_simple/ofccl_all_reduce.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index fa69a13..dec9d32 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -62,15 +62,15 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl } int myCallback(int collIdFromCqe, void *args) { - // TODO: 不打log把这里删了,不然影响性能。 - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - int collId = ((CallBackArgs *)args)->collId; - if (collId != collIdFromCqe) { - // TODO: more robust error handle. - OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); - return -1; - } + // 不打log把这里删了,不然影响性能。 + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); From 014862887f9dbdf800d22ba7b4786443f8c6137d Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 9 Oct 2022 07:47:04 +0000 Subject: [PATCH 033/109] check OK --- src_simple/common_simple.cu | 182 +++++++++++++++--------------------- 1 file changed, 73 insertions(+), 109 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index dea54ce..d193880 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -215,50 +215,35 @@ __global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; } -testResult_t CheckDelta(void *results, void *expected, size_t count, - ncclDataType_t type, double *devmax, cudaStream_t stream) { +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { switch (type) { #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - deltaKern<__nv_bfloat16, 512> - <<>>(results, expected, count, devmax); - break; + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; #endif - case ncclHalf: - deltaKern<<>>(results, expected, count, devmax); - break; - case ncclFloat: - deltaKern - <<>>(results, expected, count, devmax); - break; - case ncclDouble: - deltaKern - <<>>(results, expected, count, devmax); - break; - - case ncclChar: + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: #if NCCL_MAJOR >= 2 - case ncclUint8: + case ncclUint8: #endif - deltaKern - <<>>(results, expected, count, devmax); - break; - case ncclInt: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: #if NCCL_MAJOR >= 2 - case ncclUint32: + case ncclUint32: #endif - deltaKern - <<>>(results, expected, count, devmax); - break; - case ncclInt64: - case ncclUint64: - deltaKern - <<>>(results, expected, count, devmax); - break; + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; } CUDACHECK(cudaDeviceSynchronize()); - for (int i = 1; i < NUM_BLOCKS; i++) - devmax[0] = std::max(devmax[0], devmax[i]); + for (int i=1; i(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - return 0.25; + return 1.0 / 3.0; } template <> __device__ half testValue(const size_t offset, const int rep, @@ -494,53 +479,48 @@ void Allreduce(struct threadArgs *args, double *value, int average) { args->barrier_idx = !args->barrier_idx; } -testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, - ncclRedOp_t op, int root, int in_place, double *delta, cudaStream_t stream) { // 不要在默认stream上跑。 - size_t count = args->expectedBytes / wordSize(type); +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); double maxDelta = 0.0; - for (int i = 0; i < args->nGpus; i++) { + for (int i=0; inGpus; 
i++) { int device; - int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); CUDACHECK(cudaSetDevice(device)); - void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + - args->recvInplaceOffset * rank)) - : args->recvbuffs[i]; - TESTCHECK( - CheckDelta(data, args->expected[i], count, type, args->deltaHost, stream)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); maxDelta = std::max(*(args->deltaHost), maxDelta); #ifdef DEBUG_PRINT if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); - - cudaMemcpyAsync(expectedHost, args->expected[0], args->expectedBytes, - cudaMemcpyDeviceToHost, stream); - printf("\n Expected: "); - for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - printf("\n"); - - cudaMemcpyAsync(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost, stream); - printf("\n Actual: "); - for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); } #endif } - double nranks = args->nProcs * args->nThreads * args->nGpus; - if (args->reportErrors && maxDelta > DeltaMaxValue(type) * (nranks - 1)) - args->errors[0]++; + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; *delta = maxDelta; return testSuccess; } + testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, ncclComm_t *comms) { cudaError_t cudaErr; @@ -819,10 +799,6 @@ testResult_t completeColl(struct threadArgs *args) { testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { size_t count = args->nbytes / wordSize(type); - // if (datacheck) { - // // Initialize sendbuffs, recvbuffs and expected - // TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - // } Barrier(args); @@ -854,35 +830,21 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t Barrier(args); double maxDelta = 0; - - // IF_CHECK 如果要检查对错,把下边露出来 - // int printNum = 10; - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); - // float *ptr = (float *)malloc(printNum * sizeof(float)); - // cudaMemcpy(ptr, args->recvbuffs[0], printNum * sizeof(float), cudaMemcpyDeviceToHost); - // for (int i = 0; i < printNum; i++) { - // OFTEST_LOG(TEST, "<%lu> rank=%d, recvbuff[%d]=%f", pthread_self(), cudaDev, i, ptr[i]); - // } - // free(ptr); - + // static __thread int rep = 0; // 为了再次初始化buffer的参数,没用了。 + // rep++; if 
(datacheck) { - //test validation in single itertion, should ideally be included into the multi-iteration run - // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0, rankCtx)); // will set cbArgList[0].gotCqe = 0 - - // // // TESTCHECK(completeColl(args)); - // pthread_mutex_lock(&cbArgList[0].mutex); - // while (cbArgList[0].gotCqe == 0) { - - // } - // pthread_mutex_unlock(&cbArgList[0].mutex); - + // seenCqe[0] = 0; + // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0, rankCtx)); + // TESTCHECK(completeColl(args)); - // TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta, args->streams[0])); + ofcclDestroy(rankCtx); + // TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); // //aggregate delta from all threads and procs // Allreduce(args, &maxDelta, 3); + } else { + ofcclDestroy(rankCtx); } double timeUsec = deltaSec * 1.0E6; @@ -946,11 +908,13 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // 在这里完成check数据的准备; static __thread int rep = 0; rep++; - if (datacheck) { + if (datacheck) { // 让init数据的kernel在启动daemonKernel之前执行。 // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> rank=%d, initData OK", pthread_self(), cudaDev); } ofcclPrepareDone(rankCtx); @@ -969,20 +933,20 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, } // Benchmark - for (size_t size = args->minbytes; size <= args->maxbytes; - size = ((args->stepfactor > 1) ? size * args->stepfactor - : size + args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), - args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); - // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); - PRINT("\n"); - } + // for (size_t size = args->minbytes; size <= args->maxbytes; + // size = ((args->stepfactor > 1) ? 
size * args->stepfactor + // : size + args->stepbytes)) { + // setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); + PRINT("\n"); + // } // if (is_ofccl) { // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); - ofcclDestroy(rankCtx); + // ofcclDestroy(rankCtx); // 为了做check,把这个挪到BenchTime里边。 // } return testSuccess; From b6027be0584a4f7d64bef6aeae7664a4fd457484 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 9 Oct 2022 11:59:01 +0000 Subject: [PATCH 034/109] check ok --- src_simple/common_simple.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index d193880..622d94e 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -265,8 +265,8 @@ template <> __device__ float testValue(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 - // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - return 1.0 / 3.0; + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 2.0; } template <> __device__ half testValue(const size_t offset, const int rep, From 24290c64e1a757c48d1e18b77096f23cbf9b6edf Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 9 Oct 2022 12:21:45 +0000 Subject: [PATCH 035/109] restore semi-original NCCL's BenchTime --- src/common.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common.cu b/src/common.cu index 939e777..110d55a 100644 --- a/src/common.cu +++ b/src/common.cu @@ -597,8 +597,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Sync // TODO: 之后恢复? 
- // TESTCHECK(startColl(args, type, op, root, in_place, 0)); - // TESTCHECK(completeColl(args)); + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); Barrier(args); From d60903922c9a59e7aebf09e047e24d5e0938a315 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 12 Oct 2022 15:12:01 +0000 Subject: [PATCH 036/109] run check smoothly --- src_simple/common_simple.cu | 20 +++++++------------- src_simple/common_simple.h | 2 ++ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 622d94e..2f01418 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -265,8 +265,8 @@ template <> __device__ float testValue(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 - return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - // return 1.0 / 2.0; + // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + return 1.0 / 1.0; } template <> __device__ half testValue(const size_t offset, const int rep, @@ -829,22 +829,16 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t Barrier(args); + ofcclDestroy(rankCtx); + double maxDelta = 0; // static __thread int rep = 0; // 为了再次初始化buffer的参数,没用了。 // rep++; if (datacheck) { - //test validation in single itertion, should ideally be included into the multi-iteration run - // seenCqe[0] = 0; - // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0, rankCtx)); - // TESTCHECK(completeColl(args)); - - ofcclDestroy(rankCtx); - // TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); - // //aggregate delta from all threads and procs - // Allreduce(args, &maxDelta, 3); - } else { - ofcclDestroy(rankCtx); + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); } double timeUsec = deltaSec * 1.0E6; diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index dc75f47..406f634 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -16,6 +16,8 @@ #include #include "nccl1_compat.h" +// #define DEBUG_PRINT 1 + #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) #define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) From 4cd20919b8cfdc85dba915578e90cf3d0d94ad9c Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 13 Oct 2022 03:31:18 +0000 Subject: [PATCH 037/109] finalize check --- src/common.cu | 45 ------------------------------------- src_simple/common_simple.cu | 4 ++-- 2 files changed, 2 insertions(+), 47 deletions(-) diff --git a/src/common.cu b/src/common.cu index 110d55a..9c2588a 100644 --- a/src/common.cu +++ b/src/common.cu @@ -596,7 +596,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } // Sync - // TODO: 之后恢复? 
TESTCHECK(startColl(args, type, op, root, in_place, 0)); TESTCHECK(completeColl(args)); @@ -674,50 +673,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t static __thread int rep = 0; rep++; if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture for data check - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); - } - } -#endif - - //test validation in single itertion, should ideally be included into the multi-iteration run - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // End cuda graph capture - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); - } - // Instantiate cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); - } - // Launch cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); - } - } -#endif - - TESTCHECK(completeColl(args)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - //destroy cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphExecDestroy(graphExec[i])); - CUDACHECK(cudaGraphDestroy(graphs[i])); - } - } -#endif - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); //aggregate delta from all threads and procs diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 2f01418..8168869 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -265,8 +265,8 @@ template <> __device__ float testValue(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 - // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - return 1.0 / 1.0; + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 1.0; } template <> __device__ half testValue(const size_t offset, const int rep, From b8dc018fe3f14b2a217bf2ff453f8e2dfcccc62c Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 13 Oct 2022 15:26:11 +0000 Subject: [PATCH 038/109] adapt to volunteer quit --- src_simple/common_simple.cu | 12 ++++++++---- src_simple/ofccl_all_reduce.cu | 17 +++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 622d94e..52f3174 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -785,7 +785,7 @@ testResult_t completeColl(struct threadArgs *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // if (cudaDev == 0) { - // OFTEST_LOG(TEST, "<%lu> rank=%d, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); // } } @@ -914,10 +914,14 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, initData OK", pthread_self(), cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), cudaDev); } - ofcclPrepareDone(rankCtx); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclPrepareDone from TimeTest", 
pthread_self(), cudaDev); + ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 + // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // } // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 @@ -1239,7 +1243,7 @@ testResult_t run() { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, multi_iters = %d", pthread_self(), cudaDev, multi_iters); + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); #define MAX_LINE 2048 char line[MAX_LINE]; int len = 0; diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index dec9d32..049b69c 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -49,7 +49,7 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); } - // OFTEST_LOG(TEST, "<%lu> rank=%d, done AllReduceInitData", pthread_self(), cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -63,18 +63,19 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl int myCallback(int collIdFromCqe, void *args) { // 不打log把这里删了,不然影响性能。 - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); - // int collId = ((CallBackArgs *)args)->collId; // if (collId != collIdFromCqe) { // // more robust error handle. - // OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); // return -1; // } pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + int collId = ((CallBackArgs *)args)->collId; + OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); return 0; } @@ -88,8 +89,8 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); - // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; } From bd105235de11cb9c3fe57201d297edf1dbab2b00 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 13 Oct 2022 19:47:22 +0000 Subject: [PATCH 039/109] adapt to volunteer quit --- src_simple/common_simple.cu | 2 +- src_simple/ofccl_all_reduce.cu | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 9ed7393..55fc804 100644 --- a/src_simple/common_simple.cu +++ 
b/src_simple/common_simple.cu @@ -913,7 +913,6 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclPrepareDone from TimeTest", pthread_self(), cudaDev); ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // } @@ -928,6 +927,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), cudaDev, iter, multi_iters); } // Benchmark diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 049b69c..0c5593b 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -72,10 +72,10 @@ int myCallback(int collIdFromCqe, void *args) { ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - int collId = ((CallBackArgs *)args)->collId; - OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); return 0; } From 74d4f0def1d74bf94a67b80b231a56165a4d33af Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 14 Oct 2022 11:45:05 +0000 Subject: [PATCH 040/109] keep the report log --- src_simple/common_simple.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 55fc804..e1149ed 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -813,6 +813,10 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t } TESTCHECK(completeColl(args)); + + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; From ed7f645ddb224dce418045ea3d6a9f4960ebc7da Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 17 Oct 2022 06:58:41 +0000 Subject: [PATCH 041/109] try pure inplace --- src_inplace/Makefile | 109 ++ src_inplace/common_inplace.cu | 1477 +++++++++++++++++ src_inplace/common_inplace.h | 289 ++++ src_inplace/nccl1_compat.h | 50 + .../ofccl_all_reduce_inp.cu | 61 +- src_simple/all_reduce_group.cu | 143 -- src_simple/common_simple.cu | 4 +- 7 files changed, 1980 insertions(+), 153 deletions(-) create mode 100644 src_inplace/Makefile create mode 100644 src_inplace/common_inplace.cu create mode 100644 src_inplace/common_inplace.h create mode 100644 src_inplace/nccl1_compat.h rename src_simple/all_reduce_simple.cu => src_inplace/ofccl_all_reduce_inp.cu (63%) delete mode 100644 src_simple/all_reduce_group.cu diff --git a/src_inplace/Makefile b/src_inplace/Makefile new file mode 100644 index 0000000..8b0e124 --- /dev/null +++ b/src_inplace/Makefile @@ -0,0 +1,109 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 1 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. +# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := ofccl_all_reduce_inp +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_inplace.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_inplace.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_inplace/common_inplace.cu b/src_inplace/common_inplace.cu new file mode 100644 index 0000000..023030b --- /dev/null +++ b/src_inplace/common_inplace.cu @@ -0,0 +1,1477 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_inplace.h" +#include "cuda.h" +#include "nccl.h" +#include +#include +#include +#include +#include + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; +#else +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static size_t stepBytes = 1 * 1024 * 1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int multi_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; + break; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch (type) { + case ncclHalf: + return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + return 1e-2; +#endif + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + // case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: + return 1e-200; + } + return 1e-200; +} + +template __device__ double absDiff(T a, T b) { + return fabs((double)(b - 
a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue(offset, rep, rank)); +} +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template __device__ T ncclOpSum(T a, T b) { return a + b; } +template __device__ T ncclOpProd(T a, T b) { return a * b; } +template __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template __device__ T ncclOpMin(T a, T b) { return a < b ? 
a : b; } + +// Definitions for half +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? a : b; +} + +template __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); +} +template __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); +} +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } + +template +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue(o + offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i = 1; i < nranks; i++) { + T val1 = testValue(o + offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel, preop, postop> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count, 
(void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) + data[o] = testValue(o, rep, rank); +} + +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + args->barrier_idx = !args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); + } + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx = !args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); + while (remaining) { + int idle = 1; + for (int i = 0; i < ngpus; i++) { + if (done[i]) + continue; + + cudaErr = cudaStreamQuery(streams[i]); + if (cudaErr == cudaSuccess) { + done[i] = 1; + remaining--; + idle = 0; + continue; + } + + if (cudaErr != cudaErrorNotReady) + CUDACHECK(cudaErr); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } +#endif + } + + // We might want to let other threads (including NCCL threads) use the CPU. + if (idle) + pthread_yield(); + } + free(done); + return testSuccess; +} + +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + char *sendBuff = ((char *)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + // miter就是collId。 + TESTCHECK(args->collTest->runColl( + (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank + : sendBuff), + (void *)(in_place ? 
recvBuff + args->recvInplaceOffset * rank + : recvBuff), miter, cbArgList + miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) + Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs *args) { + if (blocking_coll) + return testSuccess; + + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + pthread_mutex_lock(&cbArgList[i].mutex); + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // if (cudaDev == 0) { + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // } + + } + } + pthread_mutex_unlock(&cbArgList[i].mutex); + } + } + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + + size_t count = args->nbytes / wordSize(type); + + Barrier(args); + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter, rankCtx)); + } + + TESTCHECK(completeColl(args)); + + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + } + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec / (iters * agg_iters *multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, + args->nProcs * args->nThreads * args->nGpus); + + Barrier(args); + + ofcclDestroy(rankCtx); + + double maxDelta = 0; + // static __thread int rep = 0; // 为了再次初始化buffer的参数,没用了。 + // rep++; + if (datacheck) { + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec * 1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { + int nranks = args->nProcs * args->nGpus * args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, + recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, + &sendInplaceOffset, &recvInplaceOffset, 
+ (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, + const char *typeName, ncclRedOp_t op, const char *opName, + int root, bool is_ofccl) { + // if (is_ofccl) { + // 首先创建ofcclRankCtx_t + int thrdCudaDev; + CUDACHECK(cudaGetDevice(&thrdCudaDev)); + ofcclRankCtx_t rankCtx; + ofcclInitRankCtx(&rankCtx, thrdCudaDev); + + // prepare for all size. op, type traversed in the caller. + // TODO: if we support multi size, each size should use a separate ncclComm + for (size_t size = args->minbytes; size <= args->maxbytes; + size = ((args->stepfactor > 1) ? size * args->stepfactor + : size + args->stepbytes)) { + setupArgs(size, type, args); + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx)); + } + } + + // 在这里完成check数据的准备; + static __thread int rep = 0; + rep++; + if (datacheck) { // 让init数据的kernel在启动daemonKernel之前执行。 + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), cudaDev); + } + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 + // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); + // } + + // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 + // warmup还是需要开,不然ofccl性能拉胯。 + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), cudaDev, iter, multi_iters); + } + + // Benchmark + // for (size_t size = args->minbytes; size <= args->maxbytes; + // size = ((args->stepfactor > 1) ? size * args->stepfactor + // : size + args->stepbytes)) { + // setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + // TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // 由于我们把ofcclDestroy挪到BenchTime里边,所以没办法在这里通过调用两次BenchTime来先做out-of-place,再做in-place。像这样的话,可以在BenchTime里加个循环。 + PRINT("\n"); + // } + + // if (is_ofccl) { + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); + // ofcclDestroy(rankCtx); // 为了做check,把这个挪到BenchTime里边。 + // } + + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadRunTests"); + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. 
+ int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, + test_typenames[nccltype], + (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadInit"); + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs * args->nThreads * args->nGpus; + + // set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i = 0; i < args->nGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread *thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + // CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char *argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", 
required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); + + if (c == -1) + break; + + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'M': + multi_iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \ + CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA " + "11.3. 
Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-M,--multi_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), + MPI_BYTE, MPI_COMM_WORLD); + for (int p = 0; p < nProcs; p++) { + if (p == proc) + break; + if (hostHashs[p] == hostHashs[proc]) + localRank++; + } +#endif + is_main_thread = (proc == 0) ? 1 : 0; + + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? "factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? 
(char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + // size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + // if (maxBytes > memMaxBytes) { + // maxBytes = memMaxBytes; + // if (proc == 0) + // printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + // maxBytes); + // } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads]; + void *recvbuffs[nGpus * nThreads]; + void *expected[nGpus * nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + (size_t)nProcs * nGpus * nThreads); + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + expected + i, (size_t)maxBytes, + nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when use multi size. + ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use seprate comm + // TODO: we do not support MPI now. 
+ for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: 不支持多个size。 + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + threads[t].args.sendbuffs = sendbuffs + t * nGpus; + threads[t].args.recvbuffs = sendbuffs + t * nGpus; + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + if (sendbuffs[i]) + CUDACHECK(cudaFree((char *)sendbuffs[i])); + // if (recvbuffs[i]) + // CUDACHECK(cudaFree((char *)recvbuffs[i])); + if (datacheck) + CUDACHECK(cudaFree(expected[i])); + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_inplace/common_inplace.h b/src_inplace/common_inplace.h new file mode 100644 index 0000000..406f634 --- /dev/null +++ b/src_inplace/common_inplace.h @@ -0,0 +1,289 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +// #define DEBUG_PRINT 1 + +#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. 
%s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < 
n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_simple/all_reduce_simple.cu b/src_inplace/ofccl_all_reduce_inp.cu similarity index 63% rename from src_simple/all_reduce_simple.cu rename to src_inplace/ofccl_all_reduce_inp.cu index bdeeb48..9b9c95f 100644 --- 
a/src_simple/all_reduce_simple.cu +++ b/src_inplace/ofccl_all_reduce_inp.cu @@ -5,14 +5,18 @@ ************************************************************************/ #include "cuda_runtime.h" -#include "common_simple.h" +#include "common_inplace.h" +#include +#include +#include +#include void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { @@ -32,6 +36,9 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + for (int i=0; i<args->nGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; CUDACHECK(cudaSetDevice(gpuid)); @@ -42,6 +49,7 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); } + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -53,8 +61,44 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); +int myCallback(int collIdFromCqe, void *args) { + // Removed the logging here; otherwise it hurts performance. + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } @@ -63,7 +107,8 @@ struct testColl allReduceTest = { AllReduceGetCollByteCount, AllReduceInitData, AllReduceGetBw, - AllReduceRunColl + AllReduceRunColl, + AllReducePrepare }; void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { @@ -100,7 +145,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t for (int i=0; i -#include - -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6s", size, count, typeName, opName); -} - -void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = count; - *recvcount = count; - *sendInplaceOffset = 0; - *recvInplaceOffset = 0; - *paramcount = *sendcount; -} - -testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); - size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; - - for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); - 
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); - CUDACHECK(cudaDeviceSynchronize()); - } - return testSuccess; -} - -void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * typesize) / 1.0E9 / sec; - - *algBw = baseBw; - double factor = ((double)(2*(nranks - 1)))/((double)nranks); - *busBw = baseBw * factor; -} - -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - static int round; - ncclGroupStart(); - printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); - - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 2nd allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 5th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 6th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 7th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 8th allreduce\n", getpid(), round); - - ncclGroupEnd(); - printf("<%d> %d ofccl_nccl_test group end\n", getpid(), round); - round++; - return testSuccess; -} - -struct testColl allReduceTest = { - "AllReduce", - AllReduceGetCollByteCount, - AllReduceInitData, - AllReduceGetBw, - AllReduceRunColl -}; - -void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { - size_t paramcount, sendInplaceOffset, recvInplaceOffset; - AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); -} - -testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { - args->collTest = &allReduceTest; - // ncclDataType_t *run_types; - // ncclRedOp_t *run_ops; - // const char **run_typenames, **run_opnames; - // int type_count, op_count; - - // if ((int)type != -1) { - // type_count = 1; - // run_types = &type; - // run_typenames = &typeName; - // } else { - // type_count = test_typenum; - // run_types = test_types; - // run_typenames = test_typenames; - // } - - // if ((int)op != -1) { - // op_count = 1; - // run_ops = &op; - // run_opnames = &opName; - // } else { - // op_count = test_opnum; - // run_ops = test_ops; - // run_opnames = test_opnames; - // } - - // for (int i=0; i %d ofccl_nccl_test invoke TimeTest\n", getpid(), test_round); - test_round++; - TESTCHECK(TimeTest(args, ncclFloat, "float", ncclSum, "sum", -1)); - return testSuccess; -} - -struct testEngine allReduceEngine = { - 
AllReduceGetBuffSize, - AllReduceRunTest }; - -#pragma weak ncclTestEngine=allReduceEngine diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index e1149ed..8fc3e4e 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -816,7 +816,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; @@ -942,7 +942,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); - // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // Since ofcclDestroy was moved into BenchTime, we can no longer call BenchTime twice here to run out-of-place first and then in-place; to do that, add a loop inside BenchTime. PRINT("\n"); // } From eed57ca6337a5f65f4804d1695fa18a349354f6a Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 17 Oct 2022 09:24:29 +0000 Subject: [PATCH 042/109] log format --- src_inplace/common_inplace.cu | 2 +- src_inplace/ofccl_all_reduce_inp.cu | 4 ++-- src_simple/common_simple.cu | 2 +- src_simple/ofccl_all_reduce.cu | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src_inplace/common_inplace.cu b/src_inplace/common_inplace.cu index 023030b..4cb08c3 100644 --- a/src_inplace/common_inplace.cu +++ b/src_inplace/common_inplace.cu @@ -785,7 +785,7 @@ testResult_t completeColl(struct threadArgs *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // if (cudaDev == 0) { - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); // } } diff --git a/src_inplace/ofccl_all_reduce_inp.cu b/src_inplace/ofccl_all_reduce_inp.cu index 9b9c95f..9123391 100644 --- a/src_inplace/ofccl_all_reduce_inp.cu +++ b/src_inplace/ofccl_all_reduce_inp.cu @@ -75,7 +75,7 @@ int myCallback(int collIdFromCqe, void *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // int collId = ((CallBackArgs *)args)->collId; - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); return 0; } @@ -89,7 +89,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 8fc3e4e..b889947 100644 --- a/src_simple/common_simple.cu +++ 
b/src_simple/common_simple.cu @@ -785,7 +785,7 @@ testResult_t completeColl(struct threadArgs *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // if (cudaDev == 0) { - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); // } } diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 0c5593b..42c9628 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -75,7 +75,7 @@ int myCallback(int collIdFromCqe, void *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // int collId = ((CallBackArgs *)args)->collId; - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); return 0; } @@ -89,7 +89,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; From 0ef76cc20d6ed5098a9de517d6eb5890b710fce3 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 18 Oct 2022 14:45:24 +0000 Subject: [PATCH 043/109] manual buffer size done --- src_manual_size/Makefile | 109 ++ src_manual_size/common_ms.cu | 1496 ++++++++++++++++++++++++ src_manual_size/common_ms.h | 292 +++++ src_manual_size/nccl1_compat.h | 50 + src_manual_size/ofccl_all_reduce_ms.cu | 173 +++ src_simple/common_simple.cu | 25 +- 6 files changed, 2124 insertions(+), 21 deletions(-) create mode 100644 src_manual_size/Makefile create mode 100644 src_manual_size/common_ms.cu create mode 100644 src_manual_size/common_ms.h create mode 100644 src_manual_size/nccl1_compat.h create mode 100644 src_manual_size/ofccl_all_reduce_ms.cu diff --git a/src_manual_size/Makefile b/src_manual_size/Makefile new file mode 100644 index 0000000..ce42152 --- /dev/null +++ b/src_manual_size/Makefile @@ -0,0 +1,109 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 1 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. 
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := ofccl_all_reduce_ms +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_ms.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_ms.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu new file mode 100644 index 0000000..f240087 --- /dev/null +++ b/src_manual_size/common_ms.cu @@ -0,0 +1,1496 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_ms.h" +#include "cuda.h" +#include "nccl.h" +#include +#include +#include +#include +#include + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +// TODO: 丑丑地搞个全局变量 +// size_t countList[MULTI_ITERS] = {4000, 8192000}; +size_t countList[MULTI_ITERS] = {4000, 8192000}; +size_t sendBytesList[MULTI_ITERS]; +size_t recvBytesList[MULTI_ITERS]; + +#if NCCL_MAJOR >= 2 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; +#else +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static size_t stepBytes = 1 * 1024 * 1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int multi_iters = MULTI_ITERS; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; + break; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch (type) { + case ncclHalf: + return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + return 1e-2; +#endif + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + 
// case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: + return 1e-200; + } + return 1e-200; +} + +template __device__ double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. 
+template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue(offset, rep, rank)); +} +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template __device__ T ncclOpSum(T a, T b) { return a + b; } +template __device__ T ncclOpProd(T a, T b) { return a * b; } +template __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template __device__ T ncclOpMin(T a, T b) { return a < b ? a : b; } + +// Definitions for half +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? a : b; +} + +template __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); +} +template __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); +} +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } + +template +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue(o + offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i = 1; i < nranks; i++) { + T val1 = testValue(o + offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel, preop, postop> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, 
ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count, (void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) + data[o] = testValue(o, rep, rank); +} + +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + args->barrier_idx = !args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); + } + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + 
MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx = !args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); + while (remaining) { + int idle = 1; + for (int i = 0; i < ngpus; i++) { + if (done[i]) + continue; + + cudaErr = cudaStreamQuery(streams[i]); + if (cudaErr == cudaSuccess) { + done[i] = 1; + remaining--; + idle = 0; + continue; + } + + if (cudaErr != cudaErrorNotReady) + CUDACHECK(cudaErr); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } +#endif + } + + // We might want to let other threads (including NCCL threads) use the CPU. 
+ if (idle) + pthread_yield(); + } + free(done); + return testSuccess; +} + +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + // char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + // char *sendBuff = ((char *)args->sendbuffs[i]) + shift; + char *recvBuff = (char *)(args->recvbuffs[miter]); + char *sendBuff = (char *)(args->sendbuffs[miter]); + + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, RUN sendbuff @ %p, recvbuff @ %p", cudaDev, miter, sendBuff, recvBuff); + + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + // miter就是collId。 + TESTCHECK(args->collTest->runColl( + (void *)(sendBuff), + (void *)(recvBuff), miter, cbArgList + miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) + Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs *args) { + if (blocking_coll) + return testSuccess; + + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + pthread_mutex_lock(&cbArgList[i].mutex); + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // if (cudaDev == 0) { + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); + // } + + } + } + pthread_mutex_unlock(&cbArgList[i].mutex); + } + } + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + + size_t count = args->nbytes / wordSize(type); + + Barrier(args); + + // Performance 
Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter, rankCtx)); + } + + TESTCHECK(completeColl(args)); + + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + } + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec / (iters * agg_iters *multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, + args->nProcs * args->nThreads * args->nGpus); + + Barrier(args); + + ofcclDestroy(rankCtx); + + double maxDelta = 0; + // static __thread int rep = 0; // 为了再次初始化buffer的参数,没用了。 + // rep++; + if (datacheck) { + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec * 1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { + int nranks = args->nProcs * args->nGpus * args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, + recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, + &sendInplaceOffset, &recvInplaceOffset, + (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, + const char *typeName, ncclRedOp_t op, const char *opName, + int root, bool is_ofccl) { + // 首先创建ofcclRankCtx_t + int thrdCudaDev; + CUDACHECK(cudaGetDevice(&thrdCudaDev)); + ofcclRankCtx_t rankCtx; + ofcclInitRankCtx(&rankCtx, thrdCudaDev); + + // prepare for all size. op, type traversed in the caller. 
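// The prepare loop below registers every collective with ofccl before any of them is launched.
// As a minimal sketch, the per-rank call sequence this test drives is roughly the following,
// with the ofccl entry points and their arguments taken from how they are used in this patch
// (error handling and the multi_iters/threading details omitted):
//
//   ofcclRankCtx_t ctx;
//   ofcclInitRankCtx(&ctx, cudaDev);                               // one context per rank/device
//   ofcclPrepareAllReduce(count, type, op, comm, collId, ctx);     // once per collId, each with its own comm
//   ofcclFinalizeRankCtx7StartHostThrds(ctx);                      // start the daemon kernel and host threads
//   ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, &cbArg, ctx);
//   while (!cbArg.gotCqe) { /* poll under cbArg.mutex until the CQE callback fires */ }
//   ofcclDestroy(ctx);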
+ // TODO: if we support multi size, each size should use a separate ncclComm + + for (int miter = 0; miter < multi_iters; miter++) { + args->nbytes = sendBytesList[miter]; + args->sendBytes = args->nbytes; + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx)); + } + + // 在这里完成check数据的准备; + static __thread int rep = 0; + rep++; + if (datacheck) { // 让init数据的kernel在启动daemonKernel之前执行。 + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev); + } + + // ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 + ofcclFinalizeRankCtx7StartHostThrds(rankCtx); + + // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 + // warmup还是需要开,不然ofccl性能拉胯。 + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + args->nbytes = sendBytesList[miter]; + args->sendBytes = args->nbytes; + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), thrdCudaDev, iter, multi_iters); + } + + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // 由于我们把ofcclDestroy挪到BenchTime里边,所以没办法在这里通过调用两次BenchTime来先做out-of-place,再做in-place。像这样的话,可以在BenchTime里加个循环。 + PRINT("\n"); + + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadRunTests"); + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, + test_typenames[nccltype], + (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadInit"); + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs * args->nThreads * args->nGpus; + + // set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i = 0; i < args->nGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread *thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t AllocateBuffLists(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes) { + CUDACHECK(cudaMalloc(sendbuff, sendBytes)); + CUDACHECK(cudaMalloc(recvbuff, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char *argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); + + if (c == -1) + break; + + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if 
(parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'M': + multi_iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \ + CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA " + "11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-M,--multi_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), + MPI_BYTE, MPI_COMM_WORLD); + for (int p = 0; p < nProcs; p++) { + if (p == proc) + break; + if (hostHashs[p] == hostHashs[proc]) + localRank++; + } +#endif + is_main_thread = (proc == 0) ? 1 : 0; + + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? 
"factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + if (multi_iters != 2) { + // TODO: he is only a baby T^T + OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d damie", pthread_self(), cudaDev, multi_iters); + } + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? (char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + if (maxBytes > memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) + printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads][MULTI_ITERS]; + void *recvbuffs[nGpus * nThreads][MULTI_ITERS]; + void *expected[nGpus * nThreads]; + // size_t sendBytes, recvBytes; + + // ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + // (size_t)nProcs * nGpus * nThreads); + + ncclTestEngine.getCollByteCountList(sendBytesList, recvBytesList, countList, multi_iters); + // for (int i = 0; i < MULTI_ITERS; i++) { + // OFTEST_LOG(TEST, "sendBytesList[%d] = %lu, recvBytesList[%d] = %lu", i, sendBytesList[i], i, recvBytesList[i]); + // } + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // 这里的调用是给每个线程分配。 + // TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + // expected + i, (size_t)maxBytes, + // nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + + for (int j = 0; j < multi_iters; j++) { + AllocateBuffLists(&sendbuffs[i][j], sendBytesList[j], &recvbuffs[i][j], recvBytesList[j]); + + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, ALLOCATE sendbuff @ %p, recvbuff @ %p", i, j, sendbuffs[i][j], recvbuffs[i][j]); + } + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when use multi size. 
+ ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use seprate comm + // TODO: we do not support MPI now. + for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: 不支持多个size。 + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + // threads[t].args.sendbuffs = sendbuffs[t]; + // threads[t].args.recvbuffs = recvbuffs[t]; + for (int j = 0; j < MULTI_ITERS; j++) { + threads[t].args.sendbuffs[j] = sendbuffs[t][j]; + threads[t].args.recvbuffs[j] = recvbuffs[t][j]; + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, DISPATCH SRC sendbuff @ %p, recvbuff @ %p", t, j, sendbuffs[t][j], recvbuffs[t][j]); + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, DISPATCH IN ARGS sendbuff @ %p, recvbuff @ %p", t, j, threads[t].args.sendbuffs[j], threads[t].args.recvbuffs[j]); + } + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, 
nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + for (int j = 0; j < MULTI_ITERS; j++) { + CUDACHECK(cudaFree((char *)sendbuffs[i][j])); + CUDACHECK(cudaFree((char *)recvbuffs[i][j])); + } + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h new file mode 100644 index 0000000..c9a477d --- /dev/null +++ b/src_manual_size/common_ms.h @@ -0,0 +1,292 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +// #define DEBUG_PRINT 1 + +#define MULTI_ITERS 2 + +#define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); + void (*getCollByteCountList)(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void* sendbuffs[MULTI_ITERS]; + size_t sendBytes; + size_t sendInplaceOffset; + void* recvbuffs[MULTI_ITERS]; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t 
(*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu new file mode 100644 index 0000000..d0fafb0 --- /dev/null +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -0,0 +1,173 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_ms.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST, "Hi <%lu> Rank<%d>, sendcount = %p, recvcount = %p, paramcount = %p, sendInplaceOffset = %p, recvInplaceOffset = %p, count = %lu, nranks = %d", pthread_self(), cudaDev, sendcount, recvcount, paramcount, sendInplaceOffset, recvInplaceOffset, count, nranks); + + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +void AllReduceGetCollByteCountList(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen) { // listLen就等于multi_iter + // OFTEST_LOG1(TEST, "hi"); + for (int i = 0; i < listLen; i++) { + *(sendCntList + i) = *(countList + i); + *(recvCntList + i) = *(countList + i); + } +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl, + AllReducePrepare +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; icollTest->initData(args, type, op, root, rep, 0)); - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev); } - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 - // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); - // } + // ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 + ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 // warmup还是需要开,不然ofccl性能拉胯。 @@ -931,25 +925,14 @@ testResult_t 
TimeTest(struct threadArgs *args, ncclDataType_t type, iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), cudaDev, iter, multi_iters); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), thrdCudaDev, iter, multi_iters); } - // Benchmark - // for (size_t size = args->minbytes; size <= args->maxbytes; - // size = ((args->stepfactor > 1) ? size * args->stepfactor - // : size + args->stepbytes)) { - // setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // 由于我们把ofcclDestroy挪到BenchTime里边,所以没办法在这里通过调用两次BenchTime来先做out-of-place,再做in-place。像这样的话,可以在BenchTime里加个循环。 PRINT("\n"); - // } - - // if (is_ofccl) { - // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); - // ofcclDestroy(rankCtx); // 为了做check,把这个挪到BenchTime里边。 - // } return testSuccess; } From d6cad8e8521fc1edc6d59415053578012c4c9791 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 19 Oct 2022 08:07:14 +0000 Subject: [PATCH 044/109] adjust log --- src_inplace/common_inplace.cu | 2 +- src_manual_size/common_ms.cu | 2 +- src_manual_size/ofccl_all_reduce_ms.cu | 8 ++++---- src_simple/common_simple.cu | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src_inplace/common_inplace.cu b/src_inplace/common_inplace.cu index 4cb08c3..22cfecb 100644 --- a/src_inplace/common_inplace.cu +++ b/src_inplace/common_inplace.cu @@ -816,7 +816,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index f240087..08687bb 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -827,7 +827,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu index d0fafb0..2d925f3 100644 --- a/src_manual_size/ofccl_all_reduce_ms.cu +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -84,10 +84,10 @@ int myCallback(int collIdFromCqe, void *args) { ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); - // int collId = ((CallBackArgs *)args)->collId; - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + int collId = ((CallBackArgs *)args)->collId; + OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, 
collId); return 0; } diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 5a0824a..42cbe1c 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -816,7 +816,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; From cec88ef5e2c75f11d4b343e0d6b158b59bfe0dd8 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 20 Oct 2022 05:28:04 +0000 Subject: [PATCH 045/109] + nccl_manual_size --- src_nccl_manual_size/Makefile | 109 ++ src_nccl_manual_size/all_reduce_nccl_ms.cu | 114 ++ src_nccl_manual_size/common_nccl_ms.cu | 1173 ++++++++++++++++++++ src_nccl_manual_size/common_nccl_ms.h | 275 +++++ src_nccl_manual_size/nccl1_compat.h | 50 + 5 files changed, 1721 insertions(+) create mode 100644 src_nccl_manual_size/Makefile create mode 100644 src_nccl_manual_size/all_reduce_nccl_ms.cu create mode 100644 src_nccl_manual_size/common_nccl_ms.cu create mode 100644 src_nccl_manual_size/common_nccl_ms.h create mode 100644 src_nccl_manual_size/nccl1_compat.h diff --git a/src_nccl_manual_size/Makefile b/src_nccl_manual_size/Makefile new file mode 100644 index 0000000..4a67159 --- /dev/null +++ b/src_nccl_manual_size/Makefile @@ -0,0 +1,109 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 1 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. 
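# Since CARDNAME and NVCC_GENCODE are assigned with ?=, both can be overridden from the
# environment or the make command line; for example (hypothetical invocations; the presets
# below only cover sm_86 for CARDNAME=3080 and sm_75 for CARDNAME=2080):
#   make CARDNAME=2080
#   make NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"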
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := all_reduce_nccl_ms +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_nccl_ms.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_nccl_ms.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_nccl_manual_size/all_reduce_nccl_ms.cu b/src_nccl_manual_size/all_reduce_nccl_ms.cu new file mode 100644 index 0000000..95d7b28 --- /dev/null +++ b/src_nccl_manual_size/all_reduce_nccl_ms.cu @@ -0,0 +1,114 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_nccl_ms.h" + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = 
test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { + case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template __device__ +float toFloat(T a) { + return (float)a; +} +template<> __device__ +float toFloat(half a) { + return __half2float(a); +} +#if 
defined(__CUDA_BF16_TYPES_EXIST__) +template<> __device__ +float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template __global__ +void deltaKern(void* A_, void* B_, size_t count, double* max) { + const T* A = (const T*)A_; + const T* B = (const T*)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x*blockDim.x + threadIdx.x; + double locmax = 0.0; + for(size_t i=tid; i locmax ) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for(int stride = BSIZE/2; stride > 1; stride>>=1) { + __syncthreads(); + if( tid < stride ) + temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + } + __syncthreads(); + if( threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep+rank+offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template<> +__device__ double testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(double)testValue(offset, rep, rank)); +} +template<> +__device__ float testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(float)testValue(offset, rep, rank)); +} +template<> +__device__ half testValue(const size_t offset, const int rep, const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template +__device__ T ncclOpSum(T a, T b) { return a+b; } +template +__device__ T ncclOpProd(T a, T b) { return a*b; } +template +__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } +template +__device__ T ncclOpMin(T a, T b) { return a +__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } +template<> +__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } +template<> +__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } +template<> +__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } + +template +__device__ T ncclPPOpIdent(T x, int arg) { return x; } +template +__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } +template +__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } +template<> +__device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x)*float(arg)); +} +template<> +__device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x)/n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x)*float(arg)); +} +template<> +__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x)/n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { + return 1 + rank%2; +} + +template +__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i=1; i(o+offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); +} + +static void* const initDataKerns[ncclNumTypes] = { + (void*)InitDataKernel< int8_t>, + (void*)InitDataKernel< uint8_t>, + (void*)InitDataKernel< int32_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< int64_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< half>, + 
(void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? 
recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + if (datacheck) { // 这里的目的应该是让测带宽跑的coll也使用非0数据。 + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } + + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + Barrier(args); + +#if CUDART_VERSION >= 11030 + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + // Thread local mode is needed for: + // - Multi-thread mode + // - P2P pre-connect + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } +#endif + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Resync CPU, restart timing, launch cuda graph + Barrier(args); + start = std::chrono::high_resolution_clock::now(); + for (int l=0; lnGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } + } +#endif + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char 
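For readers unfamiliar with the graph path used in BenchTime: when cudaGraphLaunches >= 1 the timed loop is recorded once into a CUDA graph and then replayed, so per-launch CPU overhead drops out of the measurement and the elapsed time is later divided by iters*agg_iters and by the number of graph launches. A minimal sketch of that capture/replay pattern, assuming a single stream and a placeholder enqueueIteration() that issues the collectives (both names are illustrative, not part of the harness; per-call error checks trimmed):

#include <cuda_runtime.h>

cudaError_t replayCaptured(cudaStream_t stream, int launches,
                           void (*enqueueIteration)(cudaStream_t)) {
  cudaGraph_t graph;
  cudaGraphExec_t exec;
  // Thread-local capture so concurrent capturing threads do not interfere.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  enqueueIteration(stream);              // work is recorded, not executed yet
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&exec, graph, NULL, NULL, 0);
  for (int l = 0; l < launches; l++)
    cudaGraphLaunch(exec, stream);       // replay without per-call launch cost
  cudaError_t err = cudaStreamSynchronize(stream);
  cudaGraphExecDestroy(exec);
  cudaGraphDestroy(graph);
  return err;
}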
timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // Warm-up for large size + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Warm-up for small size + setupArgs(args->minbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TODO: 实测是否恢复? + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 
'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + 
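The stream, buffer and communicator arrays set up here are flat, with one entry per (thread, GPU) pair, and the same arithmetic yields both the global NCCL rank and each thread's slice of those arrays. A small sketch of that mapping, with names mirroring the threadArgs fields used throughout the harness:

// Sketch of the rank / slice arithmetic used by the harness: worker thread t
// owns nGpus consecutive entries of the flat arrays, and the global NCCL rank
// of its i-th GPU combines process, thread and local GPU index.
static inline int globalRank(int proc, int nThreads, int thread, int nGpus, int i) {
  return (proc * nThreads + thread) * nGpus + i;
}
static inline int sliceIndex(int thread, int nGpus, int i) {
  return thread * nGpus + i;   // index into streams[], sendbuffs[], comms[], ...
}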
void* expected[nGpus*nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + threads[t].args.sendbuffs = sendbuffs+t*nGpus; + threads[t].args.recvbuffs = recvbuffs+t*nGpus; + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. 
%s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent 
of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t +#ifndef NCCL1_COMPAT_H +#define NCCL1_COMPAT_H + +#ifndef NCCL_MAJOR // NCCL 1.x +#define NCCL_MAJOR 1 +#define NCCL_MINOR 0 + +#define ncclNumOps nccl_NUM_OPS +#define ncclNumTypes nccl_NUM_TYPES + +static ncclResult_t ncclGroupStart() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } +static ncclResult_t ncclGroupEnd() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } + +#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + 
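The NCCL 1.x shims defined in nccl1_compat.h exist so the same call sites compile against both major versions: they accept the 2.x-style size_t counts, reject anything that would overflow the 1.x int counts via CHECKCOUNT, and forward to the old entry points (including the 1.x argument order of ncclAllGather). A small usage sketch under that assumption; comm, stream and the device buffers are presumed valid and ncclFloat/ncclSum stand in for the configured type and op:

// Sketch: a call site written against the size_t-count API. Built against
// NCCL 1.x, the shim narrows count to int after the CHECKCOUNT guard, so a
// count above INT_MAX comes back as ncclInvalidArgument instead of truncating.
static ncclResult_t allReduceChecked(const void* sendbuff, void* recvbuff,
                                     size_t count, ncclComm_t comm,
                                     cudaStream_t stream) {
  return ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream);
}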
+#endif From 34f1b12ebaf341507665f30cf1a3d0bf2baa1c76 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 20 Oct 2022 05:58:08 +0000 Subject: [PATCH 046/109] nccl manual size seems ok --- src_nccl_manual_size/all_reduce_nccl_ms.cu | 11 +- src_nccl_manual_size/common_nccl_ms.cu | 146 ++++++++------------- src_nccl_manual_size/common_nccl_ms.h | 7 +- 3 files changed, 68 insertions(+), 96 deletions(-) diff --git a/src_nccl_manual_size/all_reduce_nccl_ms.cu b/src_nccl_manual_size/all_reduce_nccl_ms.cu index 95d7b28..7bab5c2 100644 --- a/src_nccl_manual_size/all_reduce_nccl_ms.cu +++ b/src_nccl_manual_size/all_reduce_nccl_ms.cu @@ -27,6 +27,14 @@ void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par *paramcount = *sendcount; } +void AllReduceGetCollByteCountList(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen) { // listLen就等于agg_iters + // OFTEST_LOG1(TEST, "hi"); + for (int i = 0; i < listLen; i++) { + *(sendCntList + i) = *(countList + i); + *(recvCntList + i) = *(countList + i); + } +} + testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); @@ -108,7 +116,8 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t struct testEngine allReduceEngine = { AllReduceGetBuffSize, - AllReduceRunTest + AllReduceRunTest, + AllReduceGetCollByteCountList }; #pragma weak ncclTestEngine=allReduceEngine diff --git a/src_nccl_manual_size/common_nccl_ms.cu b/src_nccl_manual_size/common_nccl_ms.cu index f77cd48..2a8f6ec 100644 --- a/src_nccl_manual_size/common_nccl_ms.cu +++ b/src_nccl_manual_size/common_nccl_ms.cu @@ -13,6 +13,12 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() +// TODO: 丑丑地搞个全局变量 +// size_t countList[AGG_ITERS] = {4000, 8192000}; +size_t countList[AGG_ITERS] = {4000, 8192000}; +size_t sendBytesList[AGG_ITERS]; +size_t recvBytesList[AGG_ITERS]; + #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = { ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble @@ -59,7 +65,7 @@ static size_t stepFactor = 1; static int datacheck = 1; static int warmup_iters = 5; static int iters = 20; -static int agg_iters = 1; +static int agg_iters = AGG_ITERS; static int ncclop = ncclSum; static int nccltype = ncclFloat; static int ncclroot = 0; @@ -512,10 +518,10 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter) { size_t count = args->nbytes / wordSize(type); - // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange - size_t totalnbytes = max(args->sendBytes, args->expectedBytes); - size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; - size_t shift = totalnbytes * (iter % steps); + // // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); for (int i = 0; i < args->nGpus; i++) { @@ -525,8 +531,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t CUDACHECK(cudaSetDevice(cudaDev)); #endif int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - char* recvBuff = ((char*)args->recvbuffs[i]) + shift; - char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + char *recvBuff = (char *)(args->recvbuffs[iter]); + char *sendBuff = (char *)(args->sendbuffs[iter]); ncclRedOp_t op; if(opIndex < ncclNumOps) { @@ -561,8 +567,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #endif TESTCHECK(args->collTest->runColl( - (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), - (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + (void*)(sendBuff), + (void*)(recvBuff), count, type, op, root, args->comms[i], args->streams[i])); #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) @@ -590,34 +596,14 @@ testResult_t completeColl(struct threadArgs* args) { testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); - if (datacheck) { // 这里的目的应该是让测带宽跑的coll也使用非0数据。 - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - } - - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); Barrier(args); -#if CUDART_VERSION >= 11030 - cudaGraph_t graphs[args->nGpus]; - cudaGraphExec_t graphExec[args->nGpus]; - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture - for (int i=0; inGpus; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect - CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); - } - } -#endif - // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { + args->nbytes = sendBytesList[iter]; + args->sendBytes = args->nbytes; if (agg_iters>1) NCCLCHECK(ncclGroupStart()); for (int aiter = 0; aiter < agg_iters; aiter++) { TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); @@ -625,27 +611,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // End cuda graph capture - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); - } - // Instantiate cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); - } - // Resync CPU, restart timing, launch cuda graph - Barrier(args); - start = std::chrono::high_resolution_clock::now(); - for (int l=0; lnGpus; i++) { - CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); - } - } - } -#endif - TESTCHECK(completeColl(args)); auto delta = std::chrono::high_resolution_clock::now() - start; @@ -654,16 +619,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; Allreduce(args, &deltaSec, average); -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - //destroy cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphExecDestroy(graphExec[i])); - 
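Stripped of the harness plumbing, what this patch sets up per aggregated iteration is a batch of allreduces of different sizes, each on its own pre-allocated buffer pair, issued inside a single NCCL group so the launches can be aggregated. A minimal sketch under those assumptions (ncclFloat/ncclSum stand in for the configured type and op; runAggregatedIter and the parameter names are illustrative):

#include <nccl.h>

ncclResult_t runAggregatedIter(void** sendbufs, void** recvbufs,
                               const size_t* countList, int listLen,
                               ncclComm_t comm, cudaStream_t stream) {
  ncclResult_t res = ncclGroupStart();
  for (int k = 0; k < listLen && res == ncclSuccess; k++) {
    // Each collective has its own element count and its own buffer pair;
    // grouping is what lets NCCL fuse the calls instead of launching one
    // kernel per entry of countList.
    res = ncclAllReduce(sendbufs[k], recvbufs[k], countList[k],
                        ncclFloat, ncclSum, comm, stream);
  }
  ncclResult_t end = ncclGroupEnd();
  return res != ncclSuccess ? res : end;
}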
CUDACHECK(cudaGraphDestroy(graphs[i])); - } - } -#endif - double algBw, busBw; args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); @@ -714,29 +669,16 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { } testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { - // Warm-up for large size - setupArgs(args->maxbytes, type, args); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, type, op, root, 0, iter)); - } - TESTCHECK(completeColl(args)); - - // Warm-up for small size - setupArgs(args->minbytes, type, args); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, type, op, root, 0, iter)); - } - TESTCHECK(completeColl(args)); // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - // TODO: 实测是否恢复? - // TESTCHECK(BenchTime(args, type, op, root, 1)); - PRINT("\n"); - } + args->nbytes = sendBytesList[0]; + args->sendBytes = args->nbytes; + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TODO: 实测是否恢复? + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + return testSuccess; } @@ -965,6 +907,12 @@ int main(int argc, char* argv[]) { return 0; } +testResult_t AllocateBuffLists(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes) { + CUDACHECK(cudaMalloc(sendbuff, sendBytes)); + CUDACHECK(cudaMalloc(recvbuff, recvBytes)); + return testSuccess; +} + testResult_t run() { int nProcs = 1, proc = 0; int localRank = 0; @@ -1035,17 +983,24 @@ testResult_t run() { MPI_Barrier(MPI_COMM_WORLD); #endif cudaStream_t streams[nGpus*nThreads]; - void* sendbuffs[nGpus*nThreads]; - void* recvbuffs[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads][AGG_ITERS]; + void* recvbuffs[nGpus*nThreads][AGG_ITERS]; void* expected[nGpus*nThreads]; - size_t sendBytes, recvBytes; + // size_t sendBytes, recvBytes; + + // ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + ncclTestEngine.getCollByteCountList(sendBytesList, recvBytesList, countList, agg_iters); for (int i=0; i coll_id = %d, ALLOCATE sendbuff @ %p, recvbuff @ %p", i, j, sendbuffs[i][j], recvbuffs[i][j]); + } } //if parallel init is not selected, use main thread to initialize NCCL @@ -1097,8 +1052,12 @@ testResult_t run() { threads[t].args.nThreads=nThreads; threads[t].args.thread=t; threads[t].args.nGpus=nGpus; - threads[t].args.sendbuffs = sendbuffs+t*nGpus; - threads[t].args.recvbuffs = recvbuffs+t*nGpus; + // threads[t].args.sendbuffs = sendbuffs+t*nGpus; + // threads[t].args.recvbuffs = recvbuffs+t*nGpus; + for (int j = 0; j < AGG_ITERS; j++) { + threads[t].args.sendbuffs[j] = sendbuffs[t][j]; + threads[t].args.recvbuffs[j] = recvbuffs[t][j]; + } threads[t].args.expected = expected+t*nGpus; threads[t].args.ncclId = ncclId; threads[t].args.comms=comms+t*nGpus; @@ -1146,9 +1105,10 @@ testResult_t run() { // Free off CUDA allocated memory for (int 
i=0; i #include "nccl1_compat.h" +#define AGG_ITERS 2 + #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ @@ -83,6 +85,7 @@ struct testEngine { void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName); + void (*getCollByteCountList)(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen); }; extern struct testEngine ncclTestEngine; @@ -100,10 +103,10 @@ struct threadArgs { int thread; int nGpus; int localRank; - void** sendbuffs; + void* sendbuffs[AGG_ITERS]; size_t sendBytes; size_t sendInplaceOffset; - void** recvbuffs; + void* recvbuffs[AGG_ITERS]; size_t recvInplaceOffset; ncclUniqueId ncclId; ncclComm_t* comms; From c84dd891d6ffe1c68a08ee3336b7433eb3a1cd77 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 20 Oct 2022 06:10:42 +0000 Subject: [PATCH 047/109] fix manual size bug --- src_nccl_manual_size/common_nccl_ms.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src_nccl_manual_size/common_nccl_ms.cu b/src_nccl_manual_size/common_nccl_ms.cu index 2a8f6ec..8247baa 100644 --- a/src_nccl_manual_size/common_nccl_ms.cu +++ b/src_nccl_manual_size/common_nccl_ms.cu @@ -602,10 +602,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { - args->nbytes = sendBytesList[iter]; - args->sendBytes = args->nbytes; if (agg_iters>1) NCCLCHECK(ncclGroupStart()); for (int aiter = 0; aiter < agg_iters; aiter++) { + args->nbytes = sendBytesList[aiter]; + args->sendBytes = args->nbytes; TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); } if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); From 93668dc5db820d6a0aad51854d319eccd699f339 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 20 Oct 2022 06:33:13 +0000 Subject: [PATCH 048/109] non-homogeneous nccl manual size --- src_nccl_manual_size/common_nccl_ms.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src_nccl_manual_size/common_nccl_ms.cu b/src_nccl_manual_size/common_nccl_ms.cu index 8247baa..3867e40 100644 --- a/src_nccl_manual_size/common_nccl_ms.cu +++ b/src_nccl_manual_size/common_nccl_ms.cu @@ -18,6 +18,8 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() size_t countList[AGG_ITERS] = {4000, 8192000}; size_t sendBytesList[AGG_ITERS]; size_t recvBytesList[AGG_ITERS]; +// ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; +ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = { @@ -606,7 +608,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t for (int aiter = 0; aiter < agg_iters; aiter++) { args->nbytes = sendBytesList[aiter]; args->sendBytes = args->nbytes; - TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + TESTCHECK(startColl(args, typeList[aiter], op, root, in_place, iter*agg_iters+aiter)); } if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } From 71b40c7cbc21ea47e195187301faf0ee3c1f9da1 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 22 Oct 2022 12:34:19 +0000 Subject: [PATCH 049/109] + cudadev in cbArgs for ofccl manual size --- src_manual_size/common_ms.h | 1 + src_manual_size/ofccl_all_reduce_ms.cu | 11 +++++++---- 2 files changed, 8 insertions(+), 4 
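Patches 046 through 048 are easier to follow when the resulting benchmark loop is read as a whole: the per-collective byte count, and after the non-homogeneous change the per-collective type, must be selected inside the agg_iters loop, between ncclGroupStart and ncclGroupEnd. A consolidated sketch of the loop those diffs produce, reusing the globals and macros already defined in this file (benchLoopMS is an illustrative name):

static testResult_t benchLoopMS(struct threadArgs* args, ncclRedOp_t op,
                                int root, int in_place) {
  for (int iter = 0; iter < iters; iter++) {
    if (agg_iters > 1) NCCLCHECK(ncclGroupStart());
    for (int aiter = 0; aiter < agg_iters; aiter++) {
      // Size and type are chosen per grouped call: this is the fix from
      // patch 047 and the mixed-type extension from patch 048.
      args->nbytes    = sendBytesList[aiter];
      args->sendBytes = args->nbytes;
      TESTCHECK(startColl(args, typeList[aiter], op, root, in_place,
                          iter * agg_iters + aiter));
    }
    if (agg_iters > 1) NCCLCHECK(ncclGroupEnd());
  }
  TESTCHECK(completeColl(args));
  return testSuccess;
}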
deletions(-) diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index c9a477d..6b4285e 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -71,6 +71,7 @@ typedef enum { typedef struct { int collId; int gotCqe; + int cudaDev; pthread_mutex_t mutex; } CallBackArgs; diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu index 2d925f3..13ecc93 100644 --- a/src_manual_size/ofccl_all_reduce_ms.cu +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -84,9 +84,12 @@ int myCallback(int collIdFromCqe, void *args) { ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); // 这个函数之后在poller线程里调用的,所以这个获得的dev应该是不对的。 + int collId = ((CallBackArgs *)args)->collId; + int cudaDev = ((CallBackArgs *)args)->cudaDev; + OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); return 0; } @@ -95,13 +98,13 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); args->collId = collId; args->gotCqe = 0; + args->cudaDev = cudaDev; pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d", pthread_self(), cudaDev, collId); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; From f2b285d3faa6fde7fe746847f09651a503df6771 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 24 Oct 2022 02:31:16 +0000 Subject: [PATCH 050/109] 161 maunal size from resnet --- src_manual_size/common_ms.cu | 2 +- src_manual_size/common_ms.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 08687bb..d81e02a 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -17,7 +17,7 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() // TODO: 丑丑地搞个全局变量 // size_t countList[MULTI_ITERS] = {4000, 8192000}; -size_t countList[MULTI_ITERS] = {4000, 8192000}; +size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 
65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; size_t sendBytesList[MULTI_ITERS]; size_t recvBytesList[MULTI_ITERS]; diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index 6b4285e..04332a8 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -18,7 +18,8 @@ // #define DEBUG_PRINT 1 -#define MULTI_ITERS 2 +// #define MULTI_ITERS 2 +#define MULTI_ITERS 161 #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) From a32587b25d86a5c3f4df86d557d9816913c9c9cb Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 24 Oct 2022 02:45:38 +0000 Subject: [PATCH 051/109] accurate damie --- src_manual_size/common_ms.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index d81e02a..a361657 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -1240,7 +1240,7 @@ testResult_t run() { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - if (multi_iters != 2) { + if (multi_iters != MULTI_ITERS) { // TODO: he is only a baby T^T OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d damie", pthread_self(), cudaDev, multi_iters); } From 5d07bca66f6aab048068220ab765376feb36d067 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 24 Oct 2022 02:48:26 +0000 Subject: [PATCH 052/109] . --- src_manual_size/common_ms.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index a361657..6789d9f 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -1241,8 +1241,7 @@ testResult_t run() { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); if (multi_iters != MULTI_ITERS) { - // TODO: he is only a baby T^T - OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d damie", pthread_self(), cudaDev, multi_iters); + OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d damie", pthread_self(), cudaDev, multi_iters); } OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); #define MAX_LINE 2048 From 4f06775461b641f780739ca67695546d7c9d97a7 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 31 Oct 2022 11:41:08 +0000 Subject: [PATCH 053/109] aggressive no sync --- src_manual_size/common_ms.cu | 7 +++++-- src_manual_size/common_ms.h | 1 + src_manual_size/ofccl_all_reduce_ms.cu | 10 +++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 6789d9f..7f4e36c 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -17,7 +17,8 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() // TODO: 丑丑地搞个全局变量 // size_t countList[MULTI_ITERS] = {4000, 8192000}; -size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 
1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; +// size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; +size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; size_t sendBytesList[MULTI_ITERS]; size_t recvBytesList[MULTI_ITERS]; @@ -818,13 +819,15 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t for (int iter = 0; iter < iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { - seenCqe[miter] = 0; + seenCqe[miter] = 0; // TODO: 这样的写法或许不能保证“同步”,即现在的161个都跑完,才去启动下一波161个。 TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); + // usleep(100000); + int cudaDev; cudaGetDevice(&cudaDev); OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index 04332a8..c780398 100644 --- a/src_manual_size/common_ms.h 
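The ofccl path replaces stream synchronization with a CQE callback: ofcclRunAllReduce is handed a CallBackArgs, the poller thread invokes the callback when the collective's CQE arrives, and the benchmark thread spins until every collective in the batch has been marked done. A minimal model of that handshake, assuming the CallBackArgs layout declared in common_ms.h (markDone and waitBatch are illustrative names, not harness functions):

#include <pthread.h>

static int markDone(int collIdFromCqe, void* argsPtr) {
  CallBackArgs* cb = (CallBackArgs*)argsPtr;
  pthread_mutex_lock(&cb->mutex);
  cb->gotCqe = 1;                    // record completion only; calling
  pthread_mutex_unlock(&cb->mutex);  // cudaGetDevice() here would report the
  return 0;                          // poller thread's device, not the rank's
}

static void waitBatch(CallBackArgs* cbs, int n) {
  int done = 0;
  while (done < n) {                 // busy-wait; assumes every CQE arrives
    done = 0;
    for (int i = 0; i < n; i++) {
      pthread_mutex_lock(&cbs[i].mutex);
      done += cbs[i].gotCqe;
      pthread_mutex_unlock(&cbs[i].mutex);
    }
  }
}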
+++ b/src_manual_size/common_ms.h @@ -10,6 +10,7 @@ #include #include #include +#include // usleep #ifdef MPI_SUPPORT #include "mpi.h" #endif diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu index 13ecc93..74f4866 100644 --- a/src_manual_size/ofccl_all_reduce_ms.cu +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -87,10 +87,10 @@ int myCallback(int collIdFromCqe, void *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // 这个函数之后在poller线程里调用的,所以这个获得的dev应该是不对的。 - int collId = ((CallBackArgs *)args)->collId; - int cudaDev = ((CallBackArgs *)args)->cudaDev; - - OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + // int collId = ((CallBackArgs *)args)->collId; + // int cudaDev = ((CallBackArgs *)args)->cudaDev; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + return 0; } @@ -104,7 +104,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d", pthread_self(), cudaDev, collId); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; From 40fbb707108035832a058cd72925ba2ac58a9ed5 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 7 Nov 2022 14:45:58 +0000 Subject: [PATCH 054/109] a new permutation from oneflow --- src_manual_size/common_ms.cu | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 7f4e36c..2fa7bda 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -19,6 +19,11 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() // size_t countList[MULTI_ITERS] = {4000, 8192000}; // size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 
2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; +int idxList[2][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + } +}; size_t sendBytesList[MULTI_ITERS]; size_t recvBytesList[MULTI_ITERS]; @@ -810,6 +815,9 @@ testResult_t completeColl(struct threadArgs *args) { testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + int cudaDev; + cudaGetDevice(&cudaDev); + size_t count = args->nbytes / wordSize(type); Barrier(args); @@ -817,8 +825,11 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { - - for (int miter = 0; miter < multi_iters; miter++) { + // 在这个地方改变miter的遍历顺序,起到乱序调用的作用。 + for (int miter_idx = 0; miter_idx < multi_iters; miter_idx++) { + int miter = idxList[cudaDev][miter_idx]; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke %dth startColl iter for coll_id = %d", pthread_self(), cudaDev, iter, miter); + // for (int miter = 0; miter < multi_iters; miter++) { seenCqe[miter] = 0; // TODO: 这样的写法或许不能保证“同步”,即现在的161个都跑完,才去启动下一波161个。 TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter, rankCtx)); @@ -827,9 +838,6 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t TESTCHECK(completeColl(args)); // usleep(100000); - - int 
cudaDev; - cudaGetDevice(&cudaDev); OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } From e98b271cb988232762d99b4f96c31d620715eabd Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 12 Nov 2022 14:12:31 +0000 Subject: [PATCH 055/109] log --- src_manual_size/common_ms.cu | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 2fa7bda..66f5bfc 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -785,7 +785,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, return testSuccess; } -testResult_t completeColl(struct threadArgs *args) { +testResult_t completeColl(struct threadArgs *args, int iter=0) { if (blocking_coll) return testSuccess; @@ -799,10 +799,10 @@ testResult_t completeColl(struct threadArgs *args) { gotCqeCnt++; seenCqe[i] = 1; - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); // if (cudaDev == 0) { - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get %dth cqe for coll_id = %d", pthread_self(), cudaDev, iter, i); // } } @@ -824,18 +824,17 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); - for (int iter = 0; iter < iters; iter++) { + for (int iter = 1; iter <= iters; iter++) { // 在这个地方改变miter的遍历顺序,起到乱序调用的作用。 - for (int miter_idx = 0; miter_idx < multi_iters; miter_idx++) { + for (int miter_idx = 0; miter_idx < multi_iters; miter_idx++) { // for (int miter = 0; miter < multi_iters; miter++) { int miter = idxList[cudaDev][miter_idx]; // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke %dth startColl iter for coll_id = %d", pthread_self(), cudaDev, iter, miter); - // for (int miter = 0; miter < multi_iters; miter++) { - seenCqe[miter] = 0; // TODO: 这样的写法或许不能保证“同步”,即现在的161个都跑完,才去启动下一波161个。 + seenCqe[miter] = 0; TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter, rankCtx)); } - TESTCHECK(completeColl(args)); + TESTCHECK(completeColl(args, iter)); // usleep(100000); OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); From 3598de42295060710109a751f69cde0e69747ab9 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 14 Nov 2022 02:52:01 +0000 Subject: [PATCH 056/109] suit 8 cards --- src_manual_size/common_ms.cu | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 66f5bfc..1243e38 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -19,7 +19,16 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() // size_t countList[MULTI_ITERS] = {4000, 8192000}; // size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 
1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; -int idxList[2][MULTI_ITERS] = { +int idxList[8][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 
39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 
114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 } @@ -799,11 +808,9 @@ testResult_t completeColl(struct threadArgs *args, int iter=0) { gotCqeCnt++; seenCqe[i] = 1; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - // if (cudaDev == 0) { - OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get %dth cqe for coll_id = %d", pthread_self(), cudaDev, iter, i); - // } + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get %dth cqe for coll_id = %d", pthread_self(), cudaDev, iter, i); } } From bcf3b874b1a879079e21c6d77a089b1eab57667d Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 18 Nov 2022 05:21:49 +0000 Subject: [PATCH 057/109] use prepareDone --- src_simple/common_simple.cu | 10 +++++----- src_simple/ofccl_all_reduce.cu | 11 ++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 42cbe1c..4bc04bb 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -814,9 +814,9 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t TESTCHECK(completeColl(args)); - int cudaDev; - cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; @@ -912,8 +912,8 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev); } - // ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a bit and saves the kernel launch time. Also make ofccl not quit aggressively on its own. - ofcclFinalizeRankCtx7StartHostThrds(rankCtx); + ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a bit and saves the kernel launch time. Also make ofccl not quit aggressively on its own. + // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // TODO: if we support multi size, we could warm up for every size; or keep the current approach, but make sure the correct comm is selected. // warmup still needs to be enabled, otherwise ofccl performance drops badly. diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 42c9628..50aaad8 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -36,9 +36,6 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - for (int i=0; i<args->nGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; CUDACHECK(cudaSetDevice(gpuid)); @@ -49,6 +46,9 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); } + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -80,8 +80,6 @@ int myCallback(int collIdFromCqe, void *args) { } testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev));
// CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); args->collId = collId; @@ -89,6 +87,9 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); From 0ccfcc996b368407a8ba7eb647f3a8d54d98bdae Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 20 Nov 2022 13:36:00 +0000 Subject: [PATCH 058/109] nccl ms different order --- src_nccl_manual_size/common_nccl_ms.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src_nccl_manual_size/common_nccl_ms.cu b/src_nccl_manual_size/common_nccl_ms.cu index 3867e40..e4449e1 100644 --- a/src_nccl_manual_size/common_nccl_ms.cu +++ b/src_nccl_manual_size/common_nccl_ms.cu @@ -20,6 +20,10 @@ size_t sendBytesList[AGG_ITERS]; size_t recvBytesList[AGG_ITERS]; // ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; +int idxList[8][AGG_ITERS] = { + {0, 1}, + {1, 0} +}; #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = { @@ -598,6 +602,8 @@ testResult_t completeColl(struct threadArgs* args) { testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); + int cudaDev; + cudaGetDevice(&cudaDev); Barrier(args); @@ -605,7 +611,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { if (agg_iters>1) NCCLCHECK(ncclGroupStart()); - for (int aiter = 0; aiter < agg_iters; aiter++) { + // for (int aiter = 0; aiter < agg_iters; aiter++) { + for (int aiter_idx = 0; aiter_idx < agg_iters; aiter_idx++) { + int aiter = idxList[cudaDev][aiter_idx]; args->nbytes = sendBytesList[aiter]; args->sendBytes = args->nbytes; TESTCHECK(startColl(args, typeList[aiter], op, root, in_place, iter*agg_iters+aiter)); From 2b19a59a9e4a284adf1ed9de0d8e24c63fe251a7 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 26 Nov 2022 05:34:52 +0000 Subject: [PATCH 059/109] usleep --- src_manual_size/common_ms.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 1243e38..f8ad9a1 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -837,13 +837,14 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int miter = idxList[cudaDev][miter_idx]; // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke %dth startColl iter for coll_id = %d", pthread_self(), cudaDev, iter, miter); seenCqe[miter] = 0; + usleep(200); TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args, iter)); - // usleep(100000); + usleep(100000); OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } From b3b632348a9ce03339cb5e548b5b12e6800c50f9 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 26 Nov 2022 14:02:10 +0000 Subject: [PATCH 060/109] + 
ofccl_test.sh --- ofccl_test.sh | 60 ++++++++++++++++++++++++++++++++++++ src_manual_size/common_ms.cu | 2 +- 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 ofccl_test.sh diff --git a/ofccl_test.sh b/ofccl_test.sh new file mode 100644 index 0000000..f1930be --- /dev/null +++ b/ofccl_test.sh @@ -0,0 +1,60 @@ +clear + +cd /home/panlichen/work2/nccl-tests +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 +export MY_NUM_DEV=2 +# export CUDA_VISIBLE_DEVICES=0,1,4,5 +export SHOW_ALL_PREPARED_COLL=0 +export NITER=4 +export NBYTES=8K +export WARMITER=2 +export MITER=4 + +export TRAVERSE_TIMES=10 +export TOLERANT_FAIL_CHECK_SQ_CNT=500 +export CNT_BEFORE_QUIT=5 +export TOLERANT_UNPROGRESSED_CNT=50000 +export BASE_CTX_SWITCH_THRESHOLD=100 + +echo TRAVERSE_TIMES=$TRAVERSE_TIMES +echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT +echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD + +if [ -z $BINARY ];then + BINARY="NORMAL" + BINARY="MS" +fi + +if [ "$BINARY" == "NORMAL" ];then + target="./build/ofccl_all_reduce_perf" +elif [ "$BINARY" == "MS" ];then + target="./build/ofccl_all_reduce_ms_perf" + export NITER=200 + export MY_NUM_DEV=8 + export SHOW_ALL_PREPARED_COLL=1 + export WARMITER=0 +fi + + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c 0 -M $MITER" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="cuda-gdb $target" +elif [ "$RUN_TYPE" == "NSYS" ];then + cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b 64M -e 64M -f 2 -t $MY_NUM_DEV -g 1 -n 1 -w 0 -c 0" +fi + +echo cmd=$cmd +$cmd + diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index f8ad9a1..e54bb30 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -1129,7 +1129,7 @@ int main(int argc, char *argv[]) { iters = (int)strtol(optarg, NULL, 0); break; case 'M': - multi_iters = (int)strtol(optarg, NULL, 0); + // multi_iters = (int)strtol(optarg, NULL, 0); break; case 'm': #if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) From e2bfe2e80b6293ee7f50cfa0910e4e0069db91db Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 29 Nov 2022 16:55:20 +0000 Subject: [PATCH 061/109] scripts --- ofccl_test.sh | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index f1930be..c9c898d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -7,19 +7,14 @@ export NCCL_ALGO=Ring # export NCCL_MAX_NCHANNELS=1 # export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -export MY_NUM_DEV=2 -# export CUDA_VISIBLE_DEVICES=0,1,4,5 -export SHOW_ALL_PREPARED_COLL=0 -export NITER=4 -export NBYTES=8K -export WARMITER=2 -export MITER=4 + +export CHECK=0 export TRAVERSE_TIMES=10 -export TOLERANT_FAIL_CHECK_SQ_CNT=500 +export TOLERANT_FAIL_CHECK_SQ_CNT=5000 export CNT_BEFORE_QUIT=5 export TOLERANT_UNPROGRESSED_CNT=50000 -export BASE_CTX_SWITCH_THRESHOLD=100 +export BASE_CTX_SWITCH_THRESHOLD=80 echo TRAVERSE_TIMES=$TRAVERSE_TIMES echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT @@ -28,18 +23,38 @@ echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo 
BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD if [ -z $BINARY ];then - BINARY="NORMAL" + BINARY="DEBUG" BINARY="MS" + BINARY="PERF" fi -if [ "$BINARY" == "NORMAL" ];then +if [ "$BINARY" == "DEBUG" ];then + target="./build/ofccl_all_reduce_perf" + export MY_NUM_DEV=8 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export SHOW_ALL_PREPARED_COLL=1 + export NITER=4 + export NBYTES=8K + export WARMITER=2 + export MITER=4 +elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" + export MY_NUM_DEV=2 + export CUDA_VISIBLE_DEVICES=0,1,4,5 + export SHOW_ALL_PREPARED_COLL=0 + export NITER=4 + export NBYTES=8K + export WARMITER=2 + export MITER=4 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" - export NITER=200 export MY_NUM_DEV=8 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export NITER=200 export SHOW_ALL_PREPARED_COLL=1 export WARMITER=0 + export NBYTES=8K + export MITER=4 fi @@ -48,7 +63,7 @@ if [ -z $RUN_TYPE ];then fi if [ "$RUN_TYPE" == "PURE" ];then - cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c 0 -M $MITER" + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" elif [ "$RUN_TYPE" == "NSYS" ];then @@ -56,5 +71,5 @@ elif [ "$RUN_TYPE" == "NSYS" ];then fi echo cmd=$cmd -$cmd +$cmd #> /home/panlichen/work2/ofccl/log/ofccl.log From 34cd2754d2ef06c57e648b3835e784007b2c837e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 30 Nov 2022 06:10:33 +0000 Subject: [PATCH 062/109] script --- ofccl_test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index c9c898d..9dd6ff5 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -39,8 +39,8 @@ if [ "$BINARY" == "DEBUG" ];then export MITER=4 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 - export CUDA_VISIBLE_DEVICES=0,1,4,5 + export MY_NUM_DEV=8 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 export NITER=4 export NBYTES=8K From 4e8162026d8485260dd4d2218a09828d1c668936 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 1 Dec 2022 15:38:33 +0000 Subject: [PATCH 063/109] scripts --- ofccl_test.sh | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 9dd6ff5..7cdf163 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -11,21 +11,26 @@ export NCCL_ALGO=Ring export CHECK=0 export TRAVERSE_TIMES=10 -export TOLERANT_FAIL_CHECK_SQ_CNT=5000 -export CNT_BEFORE_QUIT=5 -export TOLERANT_UNPROGRESSED_CNT=50000 +export TOLERANT_UNPROGRESSED_CNT=8000 export BASE_CTX_SWITCH_THRESHOLD=80 +# export ENABLE_VQ=1 +# export TOLERANT_FAIL_CHECK_SQ_CNT=5000 +# export CNT_BEFORE_QUIT=5 + echo TRAVERSE_TIMES=$TRAVERSE_TIMES -echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT -echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +if [ ! 
-z $BINARY ];then + echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT + echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT +fi + if [ -z $BINARY ];then BINARY="DEBUG" BINARY="MS" - BINARY="PERF" + # BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then From 0c3718e300f0c4e3108b14bc131c66671ba2f657 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 5 Dec 2022 08:05:38 +0000 Subject: [PATCH 064/109] scripts --- ofccl_test.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 7cdf163..b0384ee 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -4,15 +4,16 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -# export NCCL_MAX_NCHANNELS=1 -# export NCCL_MIN_NCHANNELS=1 -# export NCCL_NTHREADS=64 +export NCCL_MAX_NCHANNELS=1 +export NCCL_MIN_NCHANNELS=1 +export NCCL_NTHREADS=64 export CHECK=0 export TRAVERSE_TIMES=10 -export TOLERANT_UNPROGRESSED_CNT=8000 +export TOLERANT_UNPROGRESSED_CNT=10000 export BASE_CTX_SWITCH_THRESHOLD=80 +export BOUNS_SWITCH_4_PROCESSED_COLL=100 # export ENABLE_VQ=1 # export TOLERANT_FAIL_CHECK_SQ_CNT=5000 @@ -21,6 +22,7 @@ export BASE_CTX_SWITCH_THRESHOLD=80 echo TRAVERSE_TIMES=$TRAVERSE_TIMES echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL if [ ! -z $BINARY ];then echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT @@ -39,7 +41,7 @@ if [ "$BINARY" == "DEBUG" ];then # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=1 export NITER=4 - export NBYTES=8K + export NBYTES=1G export WARMITER=2 export MITER=4 elif [ "$BINARY" == "PERF" ];then From dba5947486b454affa89e37149f6bf01d75f3250 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 7 Dec 2022 06:00:07 +0000 Subject: [PATCH 065/109] scripts --- ofccl_test.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index b0384ee..b0b3452 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -1,12 +1,15 @@ clear +export DEBUG_CC=0 +export DEBUG_ENQ=0 + cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -export NCCL_MAX_NCHANNELS=1 -export NCCL_MIN_NCHANNELS=1 -export NCCL_NTHREADS=64 +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 export CHECK=0 @@ -40,10 +43,11 @@ if [ "$BINARY" == "DEBUG" ];then export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=1 - export NITER=4 + export NITER=10 export NBYTES=1G - export WARMITER=2 + export WARMITER=0 export MITER=4 + export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" export MY_NUM_DEV=8 @@ -67,12 +71,14 @@ fi if [ -z $RUN_TYPE ];then RUN_TYPE="PURE" + # RUN_TYPE="GDB" fi if [ "$RUN_TYPE" == "PURE" ];then cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" + # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 elif [ "$RUN_TYPE" == "NSYS" ];then cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b 64M -e 64M -f 2 -t $MY_NUM_DEV -g 1 -n 1 -w 0 -c 0" fi From 10fefc6110ce68e96d0158196577810df2382c2b Mon Sep 17 00:00:00 2001 From: Panlichen Date: 
Thu, 8 Dec 2022 08:34:36 +0000 Subject: [PATCH 066/109] scripts --- ofccl_test.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index b0b3452..e66c4e5 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -16,7 +16,8 @@ export CHECK=0 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 export BASE_CTX_SWITCH_THRESHOLD=80 -export BOUNS_SWITCH_4_PROCESSED_COLL=100 +export BOUNS_SWITCH_4_PROCESSED_COLL=0 +export DEV_TRY_ROUND=10 # export ENABLE_VQ=1 # export TOLERANT_FAIL_CHECK_SQ_CNT=5000 @@ -26,6 +27,7 @@ echo TRAVERSE_TIMES=$TRAVERSE_TIMES echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL +echo DEV_TRY_ROUND=$DEV_TRY_ROUND if [ ! -z $BINARY ];then echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT @@ -40,13 +42,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=1 - export NITER=10 - export NBYTES=1G + export NITER=1 + export NBYTES=1M export WARMITER=0 - export MITER=4 + export MITER=1 export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" From 7a79d985070a2e9c5c170f53fc79cd2a3bc5d7d4 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 9 Dec 2022 16:37:18 +0000 Subject: [PATCH 067/109] little ms --- ofccl_test.sh | 20 ++++----- src_manual_size/common_ms.cu | 80 ++++++++++++++++++++++++++++-------- src_manual_size/common_ms.h | 10 ++++- 3 files changed, 80 insertions(+), 30 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index e66c4e5..68cc75b 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -7,9 +7,9 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -# export NCCL_MAX_NCHANNELS=1 -# export NCCL_MIN_NCHANNELS=1 -# export NCCL_NTHREADS=64 +export NCCL_MAX_NCHANNELS=1 +export NCCL_MIN_NCHANNELS=1 +export NCCL_NTHREADS=64 export CHECK=0 @@ -36,19 +36,19 @@ fi if [ -z $BINARY ];then BINARY="DEBUG" - BINARY="MS" + # BINARY="MS" # BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 - export SHOW_ALL_PREPARED_COLL=1 - export NITER=1 - export NBYTES=1M + export SHOW_ALL_PREPARED_COLL=0 + export NITER=40 + export NBYTES=128M export WARMITER=0 - export MITER=1 + export MITER=2 export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" @@ -86,5 +86,5 @@ elif [ "$RUN_TYPE" == "NSYS" ];then fi echo cmd=$cmd -$cmd #> /home/panlichen/work2/ofccl/log/ofccl.log +$cmd > /home/panlichen/work2/ofccl/log/ofccl.log diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index e54bb30..0ed1041 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -15,24 +15,68 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() -// TODO: 丑丑地搞个全局变量 -// size_t countList[MULTI_ITERS] = {4000, 8192000}; -// size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 
2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; -size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; -int idxList[8][MULTI_ITERS] = { - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 
145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 
156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - } -}; +#ifdef FULL_MS + size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; + int idxList[8][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 
5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 
158, 48, 57, 94, 0, 88 + } + }; +#else + // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512}; + // size_t idxList[8][MULTI_ITERS] = { + // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15}, + // {4, 5, 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 4, 5, 6, 9, 10, 11, 7, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 4, 5, 6, 9, 10, 11, 7, 12, 13, 14, 15}, + // {4, 2, 3, 6, 7, 8, 5, 0, 1, 9, 10, 11, 12, 13, 14, 15}, + // {4, 2, 3, 1, 9, 10, 11, 6, 7, 8, 5, 0, 12, 13, 14, 15}, + // {4, 2, 3, 1, 9, 5, 0, 12, 13, 14, 10, 11, 6, 7, 8, 15} + // // {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0} + // }; + + // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 1048576}; + // size_t idxList[8][MULTI_ITERS] = { + // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + // {0, 2, 1, 3, 5, 4, 6, 9, 8, 7}, + // {3, 2, 5, 6, 4, 7, 1, 9, 8, 0}, + // {1, 2, 4, 5, 7, 6, 8, 9, 3, 0}, + // {2, 0, 5, 7, 4, 8, 9, 6, 3, 1}, + // {3, 4, 8, 2, 1, 0, 5, 7, 9, 6}, + // {1, 3, 9, 2, 4, 7, 8, 0, 5, 6}, + // {2, 6, 8, 1, 3, 0, 4, 5, 7, 9} + // }; + size_t countList[MULTI_ITERS] = {256, 147456}; + size_t idxList[8][MULTI_ITERS] = { + {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1} + + {1, 0}, + {1, 0}, + {0, 1}, + {1, 0}, + {0, 1}, + {1, 0}, + {0, 1} + }; +#endif + size_t sendBytesList[MULTI_ITERS]; size_t recvBytesList[MULTI_ITERS]; diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index c780398..3da8981 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -19,8 +19,14 @@ // #define DEBUG_PRINT 1 -// #define MULTI_ITERS 2 -#define MULTI_ITERS 161 +// #define FULL_MS 1 + +#ifdef FULL_MS + #define MULTI_ITERS 161 +#else + // #define MULTI_ITERS 16 + #define MULTI_ITERS 2 +#endif #define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) From 7b37ceae7c4d1e2a19b965831b1b56760e0d0530 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 19 Dec 2022 11:17:25 +0000 Subject: [PATCH 068/109] + nccl_test.sh --- nccl_test.sh | 71 +++++++++++++++++++++++++++++++++++++ ofccl_test.sh | 25 ++++++++----- src_manual_size/common_ms.h | 2 +- 3 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 nccl_test.sh diff --git a/nccl_test.sh b/nccl_test.sh new file mode 100644 index 0000000..89c241a --- /dev/null +++ b/nccl_test.sh @@ -0,0 +1,71 @@ +clear + +cd /home/panlichen/work2/nccl-tests +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $BINARY ];then + # BINARY="DEBUG" + # BINARY="MS" + BINARY="PERF" +fi + +if [ "$BINARY" == "DEBUG" ];then + export MY_NUM_DEV=8 + # target="./build/ofccl_all_reduce_perf" + # # export CUDA_VISIBLE_DEVICES=0,1,4,5 + # export SHOW_ALL_PREPARED_COLL=0 + # export NITER=40 + # export NBYTES=128M + # export WARMITER=0 + # export MITER=2 + # export CHECK=0 +elif [ "$BINARY" == "PERF" ];then + target="./build/all_reduce_perf" + export MY_NUM_DEV=8 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export SHOW_ALL_PREPARED_COLL=0 + export NITER=4 + export NBYTES=8K + export WARMITER=2 + export MITER=4 +elif [ "$BINARY" == "MS" ];then + export MY_NUM_DEV=8 + # target="./build/ofccl_all_reduce_ms_perf" + # # export CUDA_VISIBLE_DEVICES=0,1,4,5 + # export NITER=200 + # export SHOW_ALL_PREPARED_COLL=1 + # export WARMITER=0 + # export NBYTES=8K + # export MITER=4 +fi + +export NSYS_FILE="nccl" +export NCU_FILE="nccl" + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" + # RUN_TYPE="NCU" +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="cuda-gdb $target" + # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 +elif [ "$RUN_TYPE" == "NSYS" ];then + cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" +elif [ "$RUN_TYPE" == "NCU" ];then + # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" +fi + +echo cmd=$cmd +$cmd #> /home/panlichen/work2/ofccl/log/ofccl-2ms-coll-master.log + diff --git a/ofccl_test.sh b/ofccl_test.sh index 68cc75b..881354d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -7,9 +7,9 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -export NCCL_MAX_NCHANNELS=1 -export NCCL_MIN_NCHANNELS=1 -export NCCL_NTHREADS=64 +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 export CHECK=0 @@ -19,7 +19,7 @@ export BASE_CTX_SWITCH_THRESHOLD=80 export BOUNS_SWITCH_4_PROCESSED_COLL=0 export DEV_TRY_ROUND=10 -# export ENABLE_VQ=1 +# export ENABLE_VQ=1 # volunteer quit # export 
TOLERANT_FAIL_CHECK_SQ_CNT=5000 # export CNT_BEFORE_QUIT=5 @@ -29,15 +29,15 @@ echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL echo DEV_TRY_ROUND=$DEV_TRY_ROUND -if [ ! -z $BINARY ];then +if [ ! -z $ENABLE_VQ ];then echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT fi if [ -z $BINARY ];then BINARY="DEBUG" - # BINARY="MS" - # BINARY="PERF" + BINARY="MS" + BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then @@ -70,10 +70,14 @@ elif [ "$BINARY" == "MS" ];then export MITER=4 fi +export NSYS_FILE="ofccl" +export NCU_FILE="ofccl" if [ -z $RUN_TYPE ];then RUN_TYPE="PURE" # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" + # RUN_TYPE="NCU" fi if [ "$RUN_TYPE" == "PURE" ];then @@ -82,9 +86,12 @@ elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 elif [ "$RUN_TYPE" == "NSYS" ];then - cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b 64M -e 64M -f 2 -t $MY_NUM_DEV -g 1 -n 1 -w 0 -c 0" + cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" +elif [ "$RUN_TYPE" == "NCU" ];then + # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" fi echo cmd=$cmd -$cmd > /home/panlichen/work2/ofccl/log/ofccl.log +$cmd #> /home/panlichen/work2/ofccl/log/ofccl-2ms-coll-master.log diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index 3da8981..1785efe 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -19,7 +19,7 @@ // #define DEBUG_PRINT 1 -// #define FULL_MS 1 +#define FULL_MS 1 #ifdef FULL_MS #define MULTI_ITERS 161 From 5bb88a119a17069d327c5f874f1b18741e3a39db Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 19 Dec 2022 13:12:35 +0000 Subject: [PATCH 069/109] fix bug in nccl_tests.sh --- nccl_test.sh | 9 ++++--- ofccl_test.sh | 2 ++ src/common.cu | 50 +++++++++++++++++++++++++++++++++++-- src/common.h | 2 ++ src_simple/common_simple.cu | 6 ++++- 5 files changed, 62 insertions(+), 7 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index 89c241a..2243904 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -33,6 +33,7 @@ elif [ "$BINARY" == "PERF" ];then export NBYTES=8K export WARMITER=2 export MITER=4 + export CHECK=0 elif [ "$BINARY" == "MS" ];then export MY_NUM_DEV=8 # target="./build/ofccl_all_reduce_ms_perf" @@ -55,15 +56,15 @@ if [ -z $RUN_TYPE ];then fi if [ "$RUN_TYPE" == "PURE" ];then - cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 elif [ "$RUN_TYPE" == "NSYS" ];then - cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o 
/home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER" elif [ "$RUN_TYPE" == "NCU" ];then - # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" - cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER" + cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER" fi echo cmd=$cmd diff --git a/ofccl_test.sh b/ofccl_test.sh index 881354d..536e56a 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -59,6 +59,7 @@ elif [ "$BINARY" == "PERF" ];then export NBYTES=8K export WARMITER=2 export MITER=4 + export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" export MY_NUM_DEV=8 @@ -68,6 +69,7 @@ elif [ "$BINARY" == "MS" ];then export WARMITER=0 export NBYTES=8K export MITER=4 + export CHECK=0 fi export NSYS_FILE="ofccl" diff --git a/src/common.cu b/src/common.cu index 9c2588a..716362b 100644 --- a/src/common.cu +++ b/src/common.cu @@ -652,6 +652,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t double deltaSec = std::chrono::duration_cast>(delta).count(); deltaSec = deltaSec/(iters*agg_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "Rank<%d>, time = %lfus", cudaDev, deltaSec * 1.0E6); Allreduce(args, &deltaSec, average); #if CUDART_VERSION >= 11030 @@ -673,6 +676,50 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t static __thread int rep = 0; rep++; if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture for data check + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); + } + } +#endif + + //test validation in single itertion, should ideally be included into the multi-iteration run + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Launch cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } +#endif + + TESTCHECK(completeColl(args)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); //aggregate delta from all threads and procs @@ -733,8 +780,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); TESTCHECK(BenchTime(args, type, op, root, 0)); - // TODO: 实测是否恢复? - // TESTCHECK(BenchTime(args, type, op, root, 1)); + TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } return testSuccess; diff --git a/src/common.h b/src/common.h index bd84d01..745bd76 100644 --- a/src/common.h +++ b/src/common.h @@ -16,6 +16,8 @@ #include #include "nccl1_compat.h" +#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) + #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 4bc04bb..6701244 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -822,9 +822,13 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t auto delta = std::chrono::high_resolution_clock::now() - start; double deltaSec = std::chrono::duration_cast>(delta).count(); - deltaSec = deltaSec / (iters * agg_iters *multi_iters); + deltaSec = deltaSec / (iters * multi_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec / cudaGraphLaunches; + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + Allreduce(args, &deltaSec, average); double algBw, busBw; From d9f1a554abf857c00d5896894ef7ed5c320db43b Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 21 Dec 2022 09:33:10 +0000 Subject: [PATCH 070/109] + run multi test scripts --- .gitignore | 4 +- nccl_test.sh | 20 +++---- ofccl_test.sh | 18 +++---- test_scripts/nccl/run.sh | 38 +++++++++++++ test_scripts/nccl/static_nccl.cpp | 42 +++++++++++++++ test_scripts/nccl/static_nccl.out | Bin 0 -> 43920 bytes test_scripts/nccl/static_nccl.sh | 28 ++++++++++ test_scripts/nccl/static_time.cpp | 37 +++++++++++++ test_scripts/nccl/static_time.out | Bin 0 -> 43920 bytes test_scripts/ofccl/clear_static_ofccl.cpp | 42 +++++++++++++++ test_scripts/ofccl/clear_static_ofccl.out | Bin 0 -> 43928 bytes test_scripts/ofccl/clear_static_ofccl.sh | 28 ++++++++++ .../ofccl/clear_static_ofccl_time.cpp | 37 +++++++++++++ .../ofccl/clear_static_ofccl_time.out | Bin 0 -> 43936 
bytes test_scripts/ofccl/run.sh | 46 ++++++++++++++++ test_scripts/ofccl/static.sh | 21 ++++++++ test_scripts/ofccl/static_time.cpp | 32 +++++++++++ test_scripts/ofccl/static_time.sh | 21 ++++++++ test_scripts/ofccl/statics_ofccl.cpp | 36 +++++++++++++ test_scripts/ofccl/statics_totalCtx.cpp | 51 ++++++++++++++++++ 20 files changed, 481 insertions(+), 20 deletions(-) create mode 100755 test_scripts/nccl/run.sh create mode 100644 test_scripts/nccl/static_nccl.cpp create mode 100755 test_scripts/nccl/static_nccl.out create mode 100755 test_scripts/nccl/static_nccl.sh create mode 100644 test_scripts/nccl/static_time.cpp create mode 100755 test_scripts/nccl/static_time.out create mode 100644 test_scripts/ofccl/clear_static_ofccl.cpp create mode 100755 test_scripts/ofccl/clear_static_ofccl.out create mode 100755 test_scripts/ofccl/clear_static_ofccl.sh create mode 100644 test_scripts/ofccl/clear_static_ofccl_time.cpp create mode 100755 test_scripts/ofccl/clear_static_ofccl_time.out create mode 100755 test_scripts/ofccl/run.sh create mode 100755 test_scripts/ofccl/static.sh create mode 100644 test_scripts/ofccl/static_time.cpp create mode 100755 test_scripts/ofccl/static_time.sh create mode 100644 test_scripts/ofccl/statics_ofccl.cpp create mode 100644 test_scripts/ofccl/statics_totalCtx.cpp diff --git a/.gitignore b/.gitignore index b0853be..5999837 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ .clangd -.vscode \ No newline at end of file +.vscode + +test_result*/ \ No newline at end of file diff --git a/nccl_test.sh b/nccl_test.sh index 2243904..b5ca1d9 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -9,21 +9,21 @@ export NCCL_ALGO=Ring # export NCCL_NTHREADS=64 if [ -z $BINARY ];then - # BINARY="DEBUG" + BINARY="DEBUG" # BINARY="MS" - BINARY="PERF" + # BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then + target="./build/all_reduce_perf" export MY_NUM_DEV=8 - # target="./build/ofccl_all_reduce_perf" - # # export CUDA_VISIBLE_DEVICES=0,1,4,5 - # export SHOW_ALL_PREPARED_COLL=0 - # export NITER=40 - # export NBYTES=128M - # export WARMITER=0 - # export MITER=2 - # export CHECK=0 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export SHOW_ALL_PREPARED_COLL=0 + export NITER=16 + export NBYTES=8K + export WARMITER=2 + export MITER=1 + export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/all_reduce_perf" export MY_NUM_DEV=8 diff --git a/ofccl_test.sh b/ofccl_test.sh index 536e56a..3465366 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -36,29 +36,29 @@ fi if [ -z $BINARY ];then BINARY="DEBUG" - BINARY="MS" - BINARY="PERF" + # BINARY="MS" + # BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=40 - export NBYTES=128M - export WARMITER=0 - export MITER=2 + export NITER=16 + export NBYTES=8K + export WARMITER=2 + export MITER=1 export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=4 + export NITER=1 export NBYTES=8K export WARMITER=2 - export MITER=4 + export MITER=16 export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" diff --git a/test_scripts/nccl/run.sh b/test_scripts/nccl/run.sh new file mode 100755 index 0000000..0e63f35 --- /dev/null +++ b/test_scripts/nccl/run.sh @@ -0,0 +1,38 @@ +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib 
+export NCCL_PROTO=Simple +export NCCL_ALGO=Ring + +export DATE=221221 + +for MY_NUM_DEV in 2 4 8 +do + unset CUDA_VISIBLE_DEVICES + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi + export RES_DIR=test_result_${DATE}_${MY_NUM_DEV}cards + if [ ! -d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 16 + do + for w in 2 + do + for m in 1 + do + for iter in 1 2 3 + do + export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M #16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/nccl/static_nccl.cpp b/test_scripts/nccl/static_nccl.cpp new file mode 100644 index 0000000..3c8b2b9 --- /dev/null +++ b/test_scripts/nccl/static_nccl.cpp @@ -0,0 +1,42 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + cout << argv[1]<<" nccl : "< a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (7+ranks);i++) + getline(cin,line); + + for(int i =0;i < 6;i++) + cin >> str; + + a.push_back(str); + cin >> str; + b.push_back(str); + + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + cout<<"nccl test algbw:"<v&HYXGg@Lhg6%z4ke0 zPG(Y!!~MKJa@xr{d#$zCUVH7e*Zy&4&c5ASzdFa|($tl!eORNEdx6AMGhxq{_y(X_ ztJ2QKU$=IFHVx@Kfs^!V2|$(CNKPs&V0s+r#bcNa>$g7$o5o8gU;KHQZGeO&JoLMhN#Pcj$f zOLD#>=ToqV?Wv%$C+SF^DwgYnZqVp*i@%!Y%(sE_IbkjHD@bL~)%>#2KAG|wIbU9l zl;ecExqb?&@~%Zbvdg=@*qmAumzSv>O1a(&y10N3b%olNRD7tbeQ{SP9PeG+TeW2I zl8Ta8q@+x6kX}>UAsw{=y99zM4VL`!eWra|ZlxGRXf$2K}$j!2iJv_-8WU@5@kM@;f&^&Z{Yd z{@=)er@rHM3PJj+4EUdAu;*Qne~va+tL~A)R9yQ!@E4_#|0LuP|K3|9u^4H(Ze=-* z4mm@x5Aknjd=c|&hz~KNJ#hKu6cd(cy57+p3F|S#A2oDc)9YFq^!8vh*b#~u!Dvgv zs;)>l*y3;N3QAgf$_>p%S*flYozcivy(<{*Fgo>MG#ZIEt_^59s9mwTK(pcT1Umgu z-H7@_1`yt6KmK@|KqznX$3g);5;LMfe|KGr-cqW2o0sZMYu7a!OLaZa+w1Xo$|M(< zLg9|O<_f*JQg89-UN6eX%$Kp?RUYpapZayMvraaGpLX96yFq##k+$PUjiz$ zAgTmz)&re4>znhn0I9*m58CanDyj~ zzj^qZi}^|}<}!qp|8lhH0-rDVXy#PFbDccE9AN1qFBfJ!wMeLOqVbY#XA%F8nZX54 z3dKZk#AnON`!Qlo(}p)((qMKd zbhZY4PcbfjUg!*(nN8#Qx|+|c@kU!nxioEr=j#gp#v+M7>c9^!koW=~R}{ZWABjjh z&H3E&R{R~d;aL@dAF<&-VB zJSH7!&Y&(ez7pO(KCAJS@N629MUB&hzffT0pCZ<0I+9ejx{7UhID~n*ZFmgn=C#O% zXIBukQXAgB?x?ciscdys+wiI_iRQE6;XvlqXv3drVl{1p4G)JmuT3`m3=^wqoi;oi z*1URbcwV9kTCWX1)8y8)?KV91DRu3#;m=V>z&$p+eSZCb4gWqH|6UuO)tXT#4? zNWlFz{JA##0UQ4PHvFIsf1V9LbvtE&DGN+lV9Ekh7MQZYlm(_NFlB)$3rtyH$^ugs zn6iMTy)pCrzxn!JEAaK_|8hOHD|QYVIb*}VzQYBFgn7m)|8t$DjgDRXJiyEg1b-({ zMvwhwY;0@~<%+c8!e;9ZZXre!7_TxZ>DKuW%Ss!W|}rsMvv8)Y1&8`J?1gfw1F~u>_Rh58z-a3 z&N0)pVKREmWu|9}^q<~Q^`#9G)Za|g#t73^}OJ+Id#p82RR@mpWtOD7sz z>Id?tVXM!bTf90nLT>6eLal&`-4_EsRp3L>Re&T!P9b+~Gcv(Qf*wM|R-Bo_-f7@5Q z4)@UFm$&-*R*bI2tuOJiF~irt;u2&_KKl09Sh5|?{FD4Az~{ONdCl@E|FNwg(oHM! 
[... GIT binary patch data omitted ...]
z@%C2bE0~MQ({H;tOqlN*ILi4IpAd#WDy`s5?{b$x`K4_i(;V5^i_2f3-xL ze;081r6xw+6mOCcKc`B$NLRY-&{2e-`!)%nZQ>vB(eL$ie9U@FvUsnV(w$KP?2F6P g57ZC4n5fECy!iRLS$Bma82F-0(U~>&%aPRn8}@h(vH$=8 literal 0 HcmV?d00001 diff --git a/test_scripts/nccl/static_nccl.sh b/test_scripts/nccl/static_nccl.sh new file mode 100755 index 0000000..761ff36 --- /dev/null +++ b/test_scripts/nccl/static_nccl.sh @@ -0,0 +1,28 @@ +g++ static_nccl.cpp -o static_nccl.out +g++ static_time.cpp -o static_time.out + +export DATE=221221 + +for cards in 2 4 8 +do + export RES_DIR="test_result_${DATE}_"$cards"cards" + export OUTPUT_BW_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards.txt" + export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards_time.txt" + echo $(date +%F%n%T)>>$OUTPUT_BW_PATH + echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH + for n in 16 + do + for w in 2 + do + for m in 1 + do + for iter in 1 2 3 + do + export INPUT_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ./static_nccl.out $INPUT_PATH $OUTPUT_BW_PATH $cards + ./static_time.out $INPUT_PATH $OUTPUT_TIME_PATH $cards + done + done + done + done +done \ No newline at end of file diff --git a/test_scripts/nccl/static_time.cpp b/test_scripts/nccl/static_time.cpp new file mode 100644 index 0000000..444446b --- /dev/null +++ b/test_scripts/nccl/static_time.cpp @@ -0,0 +1,37 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + cout << argv[1]<<" nccl : "< a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (7+ranks);i++) + getline(cin,line); + + for(int i =0;i < 5;i++) + cin >> str; + + a.push_back(str); + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + cout<<"nccl test time:"<SKDjvwLY{|gP`HQ*V@?URqBtHw*HuiwlUQfTQv8(_Fns( zGbb~_rm_FL9GI-L*IIk+wbx#I?H_06?AyIH%Q9RpOQ0nd_XO+lq0LCTlzI+8Cb3Tm9D zeAz^D(;r-J@z;b~&y;Be?fFD`70mB|x8+K_g7$o5o53}bKG>cNeO&Khp%iGWCz*@# zB{*M#^C_6m_Eb>WlXRp{Im>lIH)wRZ#a~Tx=3CAAoUn@d6{IrgYIxmfnMiqcobSB~ zAx3k;ZCpPERe7I9KC;Wlz1W;uJ(riR9g4W#3X-4Dbwzuqxvb=h_Lj@rL*cIO%e%|V zE-x!7j718I1)cPwGN}2MuV~Vsc0b`*oM?+#oF@AF@1B!)<2Q1Dbl~Mxn;L!heQ@(j z4|^WHT2+>!?Wn>6xYOr!rbY4|TrgMTg!{@yh8CBJjyPpRcJ04Yr12Mljk~ zyR!1vb=L(n9n|(%b)doUcmi$y zsBT34Ap;0+gCBpq^&l*0_Qyg2JrXmbL4QYequyAgdm9$&_1CRvFc#{1pu5}S@f1rg zFonXc)eR+jL#f{A(Y;=jk)AJY!Am{f^+LOnV7R3{m4f5wRt95Du+Wvz+>yE5UNbhr ziS=F(2!*Af@=zqELq@QqIvg^pWW`oAMA~ECx@GkZkp`V?1Vh7?vXz?sExNzGJraNk zU|zr8j5ZJJ0lyJw18YlrowUuu)}Vo*B$&!PW7-c+<7mJ1&63o7dTV#L-WiO>B4K}f z$k?c_FLGEYx$VcZP5pJ!zQXIqI$f$Z4K%{rN_FY2;SC+Bj4HiFm?|0+Hk_D-X5(0X z{WrBf=Wj4d0+B9*8xIa_HeG`e{dm1?qOFAx(i#ZpF^V7_O>YWs2%-4;V63YnNbx10 zLJOiw;1)g5c8k8wA8OYScgSCb3H?T}J7f@wDzPpajC2MSZC!|*;0uU2I!WW;)Zs_w zfB0cHS8A9=YjBR2dT8$Myft2Gvphi&+?ZT!b<_;YOdeAb`ZqQHh% z<1y(-a|U&(@s;rQ@mY1*R-8 zWr6>{E%3*@bN=G%c{|6~oBhj`*sj>xZ)A)P`g#uL91w;XEq!T)rj3kV_%cA=xf=1e z5oKiPH>0DY+eMl-U`B?XGt;#3GBUK^Ow)$T$j}eXG;Oqu41LE;)8@{|(A{R5He5!A z?l9A|(K0f0tC^+^mXRU9nWl}Ek)h9;Y1&X38LBkXw2?A04R`;n?IZIGb;W|}reP=7N`8zQK`nWl{p)Za|g1_yS}kv`jzp6y7V>PTlh(jV3& z>-)YV{f;C3x+DFnBmG~F^v@jWryc2^IMP3Iq`&V-|Fb>qd8a!5+|9oDZ+$(l4c9f+ z^ktV}>&e%5>XT?FU*F>WXs*%O_aMuA+bM|~Utjj)1Qs1LW}|C`=^kC{q|qaJ=WYhO zc7P%H?-u-}cM-VcBVT;jxBvIo`1T*m^tqn#J@=6@3j(|>AZPT*I??VbKlQ)Oi{F8J zXkC{z`Fa+QT!&j<{B>i7uXpi9$d-8Q!_m=13!M3x?5Dx!x(RvB@+tq&1`z3{3Hcg) zcf9RG2af+_AM*L)KYAFT#n+p4;XEPOoAC7edS}lfg?hIV(bvexy4#Rn40 z;kn>B0#z)Ye95zzdFBe9BmwbfV2z{5lz4%}_LY8$c?X0hPr(jnlSuHzUrhXhkfsI& zke>IbmiI(|PogNk^%WsttD6deLT||AT25XplSc@E`U}J-U*A@ZsRv}Tf|LI$nPrkR 
z43SZd_;0&<+ZtS9>$Tio3Z#k6o#mWBvtJ<`Gy|BPxE~R%X-T4C0 zh*cR#6haT*q62yN^l#}m&{UFR(Bu%x(zjr!Mf(=N3)zW%P)Qb5Y!&sU(37&ej@}F1 zW=Y-pDZ_(i2A7rLQ9|}+|G}qZe~nmoo2)M|0dCYwNYyL@iQ|6}NsSx-KJ)}r-+{yq znOw`%KS*ZTKm&<>B=?d#_456d7~b>an~ul3j`eIh?&>POW79G6+OA7r z^GB(96GE5SA1tT3mfi;xZ1*yyAb1Xl+yB!P_5@{pk_P<5E;x#42D7P(UK}eI%S1q0qNbHWW%ruRj6(Ak?;1 zNQjntqX<$5T8&l5ZcMv2dteHs**@Hlr`faP(R=GmQ(W?tFFuF?AX{_{qx6+}Zo>i*wm z`&@$#e?w6=jH;bbU?6cPLQu-m2jEKFyaN?9>6A@9P$8a>IeZ#PyHC1J$BH*4d_9|n zHRIFLYDo6hDvQlYQ+ATJ$+Aq-iTF7~3b6U7gWH=RugMc(7M_5TExU7Or``x&)_8x6 z#(N0qLsRsjbHqilK9VQ;O6@T#)MY2u{AqO5iWby&9kLQMxkW=!u30ibW+yI6%C#3h zqS>t305NSa=N`nD-M)C!PP<4k9r5n*_02vB)9NQEE7kY^&>$Z7lInZNGveMu@)nL6 z_YR7Cu>F%Ti}zt)UlTrFNEfv`B+-XumZpA2zc0~8fIb2#djWS+VgF9yI!@i}uy6kn zbX)j0I9vpvwCJ`s(wH&e72`wV6(lew-YV$>iH~UPr160?7)YFr1(C0}2v(w^4*BB0 zB3e-W-o0*>^I&q$-w}`K2vDwTkaHTG^Jqk6>r2k|kjUmNa8IfN85vyEd~-Zg#hwIY6VnIm?xL8qOF?HUiJCD%bE`P_WvatY>MUEGz*a| zuzo(@?)Ak*jOdq^r$&I~381vP{O!1*u~<%D^R|2q{xQ^xKXNo3Nc`UsNbT9QSL>P! zR&Kv!DZ(ZG2!|43OjM#)&4_Br380RsM75=1KNMH!H)w2*{ZY=b(aec2UHxV z;V^m~90|=KT5BLd188;pXzAVH;kG1)1L`=Y(HIPb>FHI+BnO10FrW(M(mdt4=*d|*2So%*5E-m#GYq=CfqqC*}~ z8vpYLuBj&jr=(aRp$~)o&0)~Fb{mNoU6GTY7fJXBGhyZE7iV{D%S}-lxDuj4;jg` zg+kaQN#7J2j0tq#77EzsEVf!oTbI3)Y-=;ZjfswIz-&hb>9B{LPP!MNkFHGg`h@fu zEWN05aTMH#UeHJVO7w&o6x3-bPOkD*D9OHhNu^q{6V&Yl22A4t&57V#QYI6_J4ueOfyFCvMKfr(Ole+@Ri&?z1uWY0l2 zqIT+-&0&B7w$02%b`_rSj=E$oC%+<72}j%y(U$6FSL2Ffv;x&?Z?c%#5Vo?$SdwBv zIf_S(c?65s_M_NCX6s$$9W<^6M%d(8p4o6{d{QK73fC{gf-nB_#Go*O_plqCGT}b@ zxTr+W!E#%q7+b0;u@7;^8bXdh6dH1oL=7a|!i{L)fmsF;6{7fG(d`_fh>Qb?7o-9x z`(ffddKHl_W!*!e{Y-nM#&7LJim}7^U4VljWbn>o&*`#7UiEbjARw#J@axg=*H9xl_^&N8w znNK;!wl#Pxq;r51p8XwCQ-%Czh@s*fnw|+=d^=pkySzz*@5*JzS{t- zE&4l^5IO&csf5s}os!rW+tKeV*>qvk+ezL$dl!Nsgts0-7?Kr(Os-fb!YNnm0;(m`+5$ze2ZS~dJFZ+x$!2N-c{aQsUKLkt}yT1 zEmWK+kD6S#Bt3UBpEL>b)SoE}Oj%&c0#g?F2U);{-?XSp-C?>Ru-L@GFK(>g;8fs# zGd|)67sN~qxK7CLWU}R<$sLKIuK>_v|LOQpnJ3{xkMU2zX9hlE+KoR^&9Aw`fk3|JUtYS1)~OJx#nbEbtm+%!~OhU zkB;uc{X<7aM_&Wn2Y4KC{%=P|?+2XyyV242pyZoxjE;6fk&-ttVF0WJoCSOUumo^D zU?boIfCk_*fI9%+0DJ^68{_d=sAvh`)qraO*8pw?ybbUdfV%1=a*8raa z%z|;=0K5S37+@7(0j6~=fY$D(pz-!1?kphb{vAJYXH*y?~v7j{|N8{5jymfS<-QwL!o}z+u3<0rTM} z_X8FI4g=N!&cGJYBY?L79tTvvg>ikMUUPNlyXMT8p0gbtitu>)t!;*XUID08O&1Rx zF2HRGK5eg{{?zeu^OxloT$^{whMdjXHD_LV>4FQ+2Ug@?1K9s6^dJOZB-g-aFYuj| zpD!wF8$O=bMn`FfUi0SWe<@?>sncqq5RycH2%n2V7r*OK^lL5pPeDHubfK@Jue0cH zgMKOK;`d;RzRIG{fIj2V=bwNM@tW%kbv440)4y4InWoKA_9VN}=M31Bp7&?v!AX=3 zzMjm?{h2}$s^<=r^BeU0boF=y^kL8;-;&X6)#G{4kAt47o^E6zI}-h~fLY&xb(J0& zJ|Tp7XrW2A@Da})HlEw9a;`>vS^=Kh!DGo-Z`NZC=sy7cVmtk2EB|evKMFe8#ms-B zMc)PbmnM``MMc)VdL!hUsC)x28reg_V?T6w}Zago}cPqi-m{5a}Ye~Y<>*%XF*TbHw$3q z{{sDLdl@mS4&|VafPRskev?IC4LZ#g()oTj=+iJaNawpdK|c%hbZxK~bRX!Y_VU}U z@(+W4Gw5^e^cyVtG0^>>r?N4%VFBh24WOrsE#;usgC4TW&sZx8p!_sP*)@)Qn!9`j z^kw$^o1}hj;C6!k2U1bR9f(fsHX z%)ip*r+L%KpcmM6SZT?p`BVnzsch&*o;9Gqhw|6k^OK*pNf{RaPj*}e8IR#Us{w&X z)r0sRw9488SucX`M!T$xdci=pI0X7k>_E@A(`|7f0s1+hQ<-KNwy`D$!}0l`&$iQR ztn%l9{%O#Cb~<$Y0$8YgAL!SBo-UTRfxgB`NB#)szZ3M0pr^7W$=?HdJLpOM0r|hk z`454<6ZCZYCqR$9G&+j!#ai{xs1g-)1C@h0=PuC2Z}(Lj$dE)iG>Uce!BdAaf`nL> zoB!2}>fC~_WO{SmeOcbzdE2t9a*Mt+&6iu=Gktk(ML4&-GPkHQcixg*_mbR#CAs-a za&yG{5)8=ch8%<-^TJo_PQUK+cy{N?{_b?`2N}<1zB)~NVmeqr5g*)r^Fr60t8UP= zpSng|uV%T9X3oP8wX@HAE?fIuHaIcu5f^ggEWSoFUQx=%Az?221EHGt(DGN+lV9Ekh7MQZYlm(_N@V8n(Y%t)R$XJ@vrQWlr z)}~HKd#7|MJiP;ut{D<&np@tS%IWSa<&9XU<5s-~a1PVevlje9OxJcI+pC>n#qiK#3U}UH-tN}@4z?$oJ~6xmPqXE zh-)6-&~qTVxHwI{OKop%GQONEuJjhqoWO$Z3~{OQ#U2(giVt6T7xjZ(1qxqL7U`XA zSUk!VX{9F~AByy~OvmFfk>=*$(k2h6_o+K27P%#+f?*xQwG2BMZf3Zh;ckWxGwf$L 
z$nXfmVTS5uhWY1d;&L-AVpzejj^SE{oeVcK+|E!+nfPDz-bp0^F9^NTJ+G;`D{OSR z7vb%oMVEWJM9TBU;-bQ$l0wfV60dTkhneVI1*)NKZ1i@}@|5^YO}&TS#txs((r!!1 zpKbP|Wd3QI>OaZ&>0;m0j-&fbxT^9$J1rC5uKYF`f3l|fM>75tP4%N>Je=EY$H@Ck zoTlJTiOm@7d?Y=L7%-*iJ|i7qUR2-6n5wU zp5&3U(fhij92n+Wqg4H ze>daLa^U}i@oop6p3#$@7dY@AFn*o`Ux0Cw_~$$DS2DiHfxn*dB@TQS@oOCTzc7BS z1AieJlKd=2AwIq>TlzuSTT4&(24 z;0GALhw*AY_YUwBmtMYD6PI|7g}bvcfh4?|Z>xDRx>3rwEf-a>tIR^+NvGapXoJ=kzh-PCMh(JYMN@yO1+>-1!>t=cB&gzCv;qbA}Ir zPjwE(X)qx14~mDtn%2tv2IFsCAQ4wH{srJ?^1U6b#DtsVG+rXH;<-3(zsmT>=OyD` zWBd&9B_(|4f`!$bEakj?X)^yd;PbUJwF1X@`cKS1kIQ9o+MgJI7uRbU%ij&f+>qlO zm#@cyh4kN2BpJ_R{+|MW8r8T$BG5g>^)mB+!%?r#pvhC&rv-SD|0q{f@%%Ck|DOdv z7iI&W!g@#LwV^fo?juGu?W7ANmcBzqmp={vzW`7ATr0kWqG^vwc5Okf#OJd8Z)bcT z@TtnBdE{vsXKK#*`O_@t4<%B7@`qFNxLo$Di&=h@@lN}{34E$@OE3i?{maXwfOA;R zmw->jKbVI9{L`eIor|P^8kVyI`14RN_5OIcy|^9*K9zhzl0FB@C7+M^S26w>_D_XB zgo@A17(4I1pZPz}e#PBbdmMQDA9J0IFFR7X53t`3Ge;Zao&9kS<8zt6i}}yOf}7<0 zfZOFF#zz_dDz~H3VL#)yFrK@aRykY78TB50Ww%D)N&c$&QZP50_Cv-y z2EPM%lJATs#~FW9u`Ks}Pp zxW6c#`{D4^AKh~$w`#}lG5;y-Z%Y1IXQY<{!EL%M#=cxOCc zf(%=m3wFJ;e#@CTCf1LlX@i|bCtf8glv-vXY-JLfpnia4K-^;&K=ADe6#{N^s zg&b!5^Q`B3#xK55%6X03Mfug;jPGTBH9n3o{U8TNDKx32>4W*$5E zfcgKG|&vVhn2AH{aXA7n%QNE|=X)dy(;X z0@9^)z6J~=-{}wc0p>I?+~!408UP!B}HILxdofD^U^X@U4L zmLD2IPOC(gg<8XrD9))Oj&fZOMW`GkS1iK0df_fQwyvhw1MP$B{9Wyaj-PYgqIX8x zLxGL*JhmE75op1vpm0eZR7E!;c2tqSCDJLglXM(vR*Hl8g56R}f7poCl!gL0{*9!j zB$tIFv99J=T6#I;hdOZRR$7jQfk=Bhj_pdz;3@Jht*vYbL_?iMTBcHQ&|O+`d9d5S zS!ThOv>Xfk8*WKUENP7fJJV82gX5At#X7ak#&Dz~z;9NF-($_)K?LYB5J6pa~rB-}m@?*ehm zB?O+JZ+Mu`*w{Hv4=R>i!C1gQ9vc_hjMMd^!FB2F$AvaW!FWQhPJc8QJ|P#$8m9s! zvI00<@&t^mTGStI9Y-8<#X83=wA3sVy+7zrAACGz+zMiJ!TE#ZG^{F`jxc1aAi}o< z4R|aD3cUei%6c4A*h0q>8bLiILRJZaJr6B36d;a-zJP%SC#?E$L}Ckm@=HSsrHeu# zOAw{H(1~C|t;r!Kg6DbshKwT8$_2R79M!f+|Qm)tI zI9MJ6i}flpcVr_?D5`6ki@aWp%AyG(8z)w#*SlU6Br5TbI2e^prw+QrL0FoO=^&}LD{#QKy` zCf`3$&AqD(y~~Rl;Yi(u=vUP%5!`TwtKNly_7k?FdCYC9Ba4&z1u1f!&D@`?HA?7| zW$H=JLz62iY^TslsyfKl8388{?_?vC=t(uFLv1OZPVD^ZBvG<8=@9R(usL!Cai}n6 zN-}tQ7ATrTB91KRh^)uR_jmc67lB)a@TGdg5)R+W5hq|OI#yha3yV6tVr__V7{KAc zr6hTrbIKj&&|6{=y$z!_oqUdb)j8D6?HjVF`hTu=~3kq)4}94@;U5PCc2KNNMEXUQzq1Vt`*(cRs*rg zF&QC(Cq)g$f{swQx&z*f30^V%clgZTWQla1b6tZu!2UgP4HJ%9oT?tgUQ#=jY?u%P zgtil$8}UeMpHO@PUWhp1^g|lAC&wb?9sX`TM!Q!uHA3*fGR)u75)H;;k#!StIWM{k zEo6kaVysuj4vSd1Ojek}?24=-7c!N!mZ`@ehP8~B)=;4^cPn!V1@D;*@As5q8H>4` zxfuH-RtRF|EhbA??P1OBtV@H@9;;3pgzL$~V$V>IU^N%?cT{5;Tcmqovij@N_o9Xp z9YKRE=yk*-xrmy~MNq2sin-XDly>r4H&FcE}&7IbNyxx^$p0a3A%vCY& z1+mQ)2*QiJC1NvfB@8LTC%7hWN5oNIF@TM&t?+%Q=Lxj=qdM=x1iaWn8RPWnRf0ueW9?G4RVN<2Oy1TC#;{toYY_Ab3r=pkC%!!itWH@kK7z8Vq_Cu)!Z~sUB~z887-^8j(5xA?$Dj;^<>m{qGLp zSgO}zr_*DyM422Cmlf=HASh3+bZgq1_ zv&Spyf~oNs@0?I&TGXr<4=%eMYaFq;N&$|Z6-;mqpFCch3;4;foLZVpPT-SUvtlY5 z4T{!C-m^phFuVJNERBemuCrryFQ*&Q%#&dp52m>Ey@)gu!}0w~Z;r)m?LNV!qqHdb zO?fC16OR*0s`1=w%vJzK%JxumKo7*a#8WF+0n?T`9>6EX8Z<{Xmw{rni9#@RlsMOJ zW5$qA;t3S2k1-3Tz;`=DJfBdM!y`LHI8l$kGhTAA{OGdNOUrT`2yHZ$b;H&E!FvSzwVjXLq$xF z)zV9@|C1hMPTDA7KDQ9l)Ja_FSbHz3)7hqw9ia&t#dI?^kJvl%u4vFxJ?xS@0-n;A z&^lh;>0)UIg}i!$PfSxMd5=|&Ox2Av&%#%+FkuQtu@<23v^Dyh#WRDXhf@I*U8*lAESkL0MJ}(bs7+tClhP-#x2o}dk{f5Cddd&ykvrca zga&V&m@|5t;Yed9W3(QZPpc+bdo`${%*lwgCFZ^;g7;)D@hvOUmoU^$jSwT3Gs}n74|r%~*?a^E)PJ6*D+Tnm>;(JYd5XO8l*lfoRu71!#H%QHLm9 zFTQ4?w|3x*JW>fcNPEi1ZWB-DCb4~o7>1_JvIdk)VJJ-yYB0jVPsZ!vO#hH!dy;#X zwY`}8#4e0R#Ft(QgKhkU=(ZL}Qz;Y^2n-NrKJpY zkHY;OA&8d(v4GGD@jdMheD^x(8{ZR3(0gcH_+)Ty_1+By)q7}^aN@OJ^nL&7_^ABl znUboYTPDrFIp&?bp|8=~^RMRo3aan#t87{7hGDfMzk1(?f^G-DGynCVo9m#eozxsVf2smO<`PKV;6jbl;0gJet_P-Ta zdN+>Buii(ZppWxSW2pEP{37u5t{s(D?=4YqH|Hb$Nl&FeqqRQVkWE#7^?nltH?zX3 
ze3f6-|LdH82@6#3K~Ye>e~9v*fsZr)cYvcJlzh#N1YK_N7r%H)zMSp93uJqKw_EZn zsPtD-@XIH0Dfm64?D^IER}@ryoDIL=ke7lFJMyddv?yplbAxiKdIK81S{H+KEidtVg9&ytfar~RJh{Hp!63T9_G!bt^G`>FKLa6{#*{O?srih|n| zL88s~hHz4Q88`O)>U}c`-g&a(l+aoJ>yG^Dy)+6sL#(s@e@1@%v|98HDnY#$U~mnk ztoD*A%YT&~nNXg7+r?qReBZzk&ae2=<g$%4hpg|D{XGQ-1IuaAZG~ zU%mh9*v-TZej?f~m0!UE(Czun-$$;MnN&YlB2+$=J_nfzr^>JBb6IHp2_cYw3${+Zy +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + cout << argv[1]<<" ofccl : "< a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (11+ranks);i++) + getline(cin,line); + + for(int i =0;i < 6;i++) + cin >> str; + + a.push_back(str); + cin >> str; + b.push_back(str); + + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + cout<<"ofccl test algbw:"<=Toqd?Wv%$C+SF^N|x({ZqVp*i@%29%(ss7IbjX+D@bL~)$~%lZ6f70aK5~1 zDaQ$K;hx?Fy_Jg= zEGjRFMM}y92kAv+Q1h)?(_%pFLBg>((H65fP4rJbGHaXn(~Iudd(A*i@y)%Po|^Tg zl}ZMcNj!8>el~%XJ|B0)BkTYDtXo{O(DMxALVV`oQ#|*!^M->@cE#?Rp4HysFPyz+ z#tNX*UJD?6W*W+0Q9wHUtPK3$&VWBFgB+?|I{9@O`0HVabo_}7{GZ9d|G!}5baG-D z^tm`gy^dwTUyz~Pmow;jR|b7HXTbk1gZy7*(ElSD_%F$Te<}n1t_<}hzjNc`yc#p; z|J@9D>N{?y5Tuu7!2cqHJ#UBnvy3@LRkswT;@Z8yUzkSzP&;F_fu^{}6X@_q z&3M!wiUZ+o^5c)U5rl=U{#YnrMq=@3(BDTLfcGu+X*8+>yCbuNfQR z)Os%rgu+r#WhfFeAtP8`8xF;5WX0ArMLJ{NhLw#?ktUOD1Vh7?vXxr>ZKl7oGZKIa zU|zr3iZ&0M0e?Ku0oJz825Fnc?ZG&Pl3+UXjA=hOjidcCH%m(MneDy3W_K_ei-i51 zq4+j)OR2*`sck==Z5pqZ_7z??*6GrM0u z>%Xb>Ie$~UJP_%LbK}8*t)^>=N8ej-n`mnxgtP|&W{e_;$1q#MTSF+mF&OLV3Q~Lt zsL+C_64-18IyRe|{Gm<*afkd>n9v^&_J-nwqDpLv1|!`;McWi2C%6$2$0TVSoVxtT zylTY?b74uTv8t|i`HFSxa9>_hw#dM3_3~QNQ?gK}==UXM9+@=DO3JOfvXTnK&}__l za>d^q{LRLEB^z@Y!peVH#&m(t6?`;v%Hz3C4qz6r^pTefGoBhG)Hu;_@wPLF{}E)kc)qUY^J=^?=2I@iIKuOFg@3(R;*UD;gYzUlkH;0ouhNGj zl1_6zx4ad9hc!H_BJf8v{5cx`Q4P;a96?QLc(ukN!U+vOSCey6!+%J_7qb4;7DXCf zjmM-T%^B3C##h4YH1GTUW7$ zXIBukQVp-KJ1R9im94HS4X@ggXg&=O2ePgP4S%|YHH>u{9u99^8#Vk43u_o18XgX7 zUELasAeamO=vV*6{lLdXI+xfX2U9!_&G*UHdfrY=s1T zK*OJ{;rDC!4{G>94S%kNpSqp0z?221EHGt(DGN+lV9Ekh7MQZYlm(_NFlB)$3rtzS zFkUbC&|AK~SMq%Qx&OHq+ZDS8<5{D_zP>|w2Zeb?D}KGkFh)i%dKRGIe8JyAl#ye< z8yy|pP5C=$17_sdQ&yTbUPg``u+p^QGIH$uR+=_iMvi^OO4H`f$g$5^Y1(iZId+Sc zrj3@7V>ei7+F%(u=C{(cu`+V(qgI+WR7Q?fTWQ)z89C;$(zJmxa_j;tO&cd8$Ii0S zv|%!G%w?quMEbRNRefoL1ogMlv@wGETWQ)5LH(^XZG@oyR+=_IP=6~;8y~2@m8K03 z)L*4x{kg*WHL|^sZmFl%u+#O9wAYco%8{;cq!&2S7dg^%9qHMQ^qG!yt|R?UU8=rs zI?}H?(l0sEzj360=}7;~k$%jPe%O)zS4aB0j`Y{{wCB~@#8cP%5-<4rUOdszTsM$A z4O^YQfirK%4wP@;ssm`Q(b;z(%Nx5Xi5uTQ?vDs8IT@dgt`(+xbgk1yj~1N21MJ2@ zhT#94;IH^BflJ@@B~JJb{OKdU11Gb6uAlgxdN)1`0=z6BZ}jLU(e5fg^}iigy$bg* zdM`T868cw!I^)O`xy9K8<5v3pYk8u3L@RKAYYU3 zmREe}z=?++LOx&OUmpNy^Y!OkR4fF0lb%6e|Lj?$Q2#C>`uYYDK2YTf6FbSi#87e- zJQqAip^D8@D0!}8o_T^NML^;wu*Pv@NwJg`ev<{7FFeJKSc+4=sR^8ErO z+3bc8D%hVweTQ}pTP4$7wnNt*lCa~iR5FoKa-XQYOGHUDRTP`9&~U+O@D))>32Lrz zt<_xfgu$NXQu=q%oiBh2u;xR_$I0$X4i?-oxN|U$mX#dC7Kc#w31SkC@+f2{2cVKH zD%&n z1t-i@3KKWIo#;8)chlRhp0ZnRI!PU{=TbE9acZ`?q8hVHR#9Cmj*w6E=N3{5f*VlA z?#C@*Kc=jY(A*_?4+5Z8O5&!IzC_OnU%&UnSnTexC}Q;WBfxo2QjlBnXlxpxj^_|Y zV~6%Wl1+Tj=Ux;IeZ(A!LNC=$r#GI0euyS0c6~%vsWyrrZJ^Y9*Jd9qp)`94_v2~y zXPN0}XDec0=Cr3+3HV@(!WV0bf7z*mar- zhfS%6YVUqkE(UpJl?__k+dhX1q9GvlzzexP*RbY4EK9?v+ObF7ipY|-^dYzsH}6nY zo^~&u^*LM`N!=&imSZJalD@v1P8jhIN~ke*j zlDwusL{E4EO1ACJnVotgcG)wRlQeUIkO4Hs06IrP6zd~-qOa7Sv?E<^@{Tu0N43rc zOFLvGX_}6PqFl3Ne9TR*Psz0(K4NIrY(f|BfI0UeyzKQQTK4E7#l*_H&o?l;84HFV zQ&y_)PqBLPCA_5iKJtu&_prQ$VU^)L~!ZKM8xkEPcPP&%58Pa?%V+%lUia5gh@_^$c^)IOjYb zQP~DkvwcToa~8NSU4c!hIiqO{40F!p>)`G?;X++00>BnS$(cd`c_Os}Dksd7Pgv1b 
zEoiUCLf_>rhkXa$$_1NZ`7%wABnzxx$hZ4_2@xX(rRAv+V0i*4t)Xx?ZfGc$6Y+wb z{|Ns$){j4OFda&|Mj*BCru|0GJg{>6rAiSl@j4tzgfUTxdNoXhJ#RC zq5pshQ1YKq4iDhThp9|CY(I+Ih_M%2;+Oy4W+lz`IDQN-@c+o2$$m=*{M ztsJQ#W1EB13XoPDsBWolre0SJ2}jWD;7Di=(ON@E8bE6kkNNsH0R8+UZWuH9=RXtf zP+j6rb%{SFA3lNU%3RWf_!7?(?l?H=5=ruaCto9zA1c1CdVTc+qMS$6ouP{r15mOxG(kF?z z$&Zm1eTRIy7szHG8;{ZYb}W8~LQ9%hJPiVEhalS(!FFhZ9?}?aDETjFJYRnil5p0c zyKDJYdF%9cywY3%D#zaRp z;4()B>99wfPP!kVkFIR=`lR%kp=1qJE`fpv&~2kNkehfeVgLiQbUBWkCQ*&GHaU~6VB)m8Y3chn_&Ir$ZtN;u*Hh_+R? z8k8%J(F#<%y~$!$Luh4D2E%KL>uoeQAkOYV^Ld;fZ!iE;M+U!YEgt|01?o_iO%lRL}jzVfoEVM+YM$dzg zgBhaM{M?Q-yjn}hLKLM%c_EA3qaW)8PLl=O-;Cgif85#FpKigU*sI7q+~e%PIml!&$GNAUzZrc)UPJ-$Li5p zzxDMUborM2rsrkUF7LVxG|8*JzS=yv$u5$bj2d1zFFi-Io}3Adrv6M>V9Ekh7Wn_x z0xtZfMP2F+gS5b6>jS^Iv44Y8h5H@&h#y=KGcn*gCBKtN%R^Hl5<_1BpvV5x@u4zL z!-pQ@pMlQ|e8glGf1-xpa7Q);0-f%qZbRP0gR!{V-`U=}b*XWgb?L;`o){6g1OxF% zbm>yF-o(%A=tp~IC>#q$}B+u^9Gbv^xM(VLEP^-idhHXK0y3%$tZeZbaWrm zuHTQ2o<#ayj7)JTcO39ez#I(0MZlj6SP8fQkbY0U91oJhXH9HuK_ciBw#OK9t?I5U@_qRGy?*B60m~|iTU%bfQ5kj z09OMZ23!xA1pF*u9*lh$uo$oi6L}xt8o&;~>j7^CY{cy10YE?CFyQ9_PXIm)Scrxi z1}p_!iPiafzz)Fefa|a;vKR0^z(K$y;8DQyG5w;(?FDoLeipC_@CCp&z!_L)>;zm0 zcpqRl;32@f0AB<=1egQIcnfeo;L0~fM{5AL1GWKv8E_}y_W|z%90oiDxDTUBjdWU z(Qx$^y5`K7p0^tvjqrFXZgjv$KTMo8)5RA4eB74f)9@U8hdN|_;mZ7?)dgp4&D&vo z1!?)eYH)00_dNCF7#FTH`(-8Kz|N&@jEd^zs9D|fIj2VuQ&yr)NbV|tk` zKV;{B8}x4EFVN{LZTfkLouvN~o!)KJuLS+Zar9dcI_Y1c=MUQXw{!lhbb8#T-vjy{ zE!?EKGx{ygaE>PI&JA?p7m=x+e#d<_9t+1Lwz{6m&i z&V2am7cq9ElS6vY?`IRBXR>n}=y!vDnXaeb)^jK5Uk5#1I~OC*da?uPJ-E-Ar|DTc zfNs;5n-#ES4MWxo;JrXEYn`OKfja^Ei=dy->8Nkk3$~0|7`q<8SXig?WUUYkRNr#Y zp98%{r$YvLKGB;&H!wyn*6CqeejM}_vbz62{PS(eWi^_NiJl_J3 z_?@)syDfGdp96g#=$UNvHs}wJlm9$~{f9s=(sfvC%fAxzAA+9FhHm6p5BfJi-=gOy zKkbk*<^xZ5JPDo$ai7zKK&0wHeBZXq+6P$)%onfIWo0!A2C~Ir&<}xrg-+MvLK5_0 z(5XzT3~j8*!*KjG=(BZton3x0=#PW$)9KLhMzB!%KG0tSJyR_206iaj_bGJbk8u85 zLBAaIbhae<`#}E?=qddH`9H<^4}-oM^i29ELBH_X(NTO~)~8Gdp@+Kc)~T}`c01Ocy=)Yb?yaE1*R-8Wq~OROj%&c0#g>4vcQxD zGFd=uB;flPSc1}}-m|CHpiW48q;x4fy#tW084}GKO!NOpEn6Zq<7L=SZ|z zN7Jo(2bp^BAMF*;h3yV;z4`8Fgl_g?50x$~f5k=nE_7iVOk78pj-|P{W-=olCy7h! 
z0pU*UFEEY(XERQQB@#P0;wt7Fdfr197iXwJZKZSsJ6pSn|Gsas;I7&b86 z$grE?4u-oK?q&D@!$F3_439EA!BD--u&TQl3whm6nv2mv}Cfc$FhF%tr4jQVp%K(c3{Q z)8ey@b!qWA#`d)MT&o{x;?W1D8LI!J;-`!KO&v$~*>F|me>yE2-md&M6@R*+`bR4M z3`6y!R6Ly9tz+bUHXi!A)8g}uy=mGv+bA#|NQ*DD&TY_3$;KP3RoqI&V^_0E$H@C^ z;{%3zFTYOB#?#|HD|C$FpKZ9;q{E+W#Ydf*jcB9dVk*AKNY^j3jdQFxmdcMP+pS~d zeYW;KaGjcMohD&pUAWCLUK_PB@{Yn#7NU8>yi^K>N5b!L;HfzXuf`jdi=H*PpwIrO z#87#3(enjY8as3WPjb}zWQmuqZTPs1vG&{pd^-KVC-`R?c~!WT3~ur~e`aX-g+(4qfd1%9S6qY5{;RGhvL1-bCX{lY4Nw#qGM{8{g8q9 zXFBj7XZ$S2EBU>QFLL1Nx1=QJJO}@Ts_!h=@IPf-^BR6 z4*WjG-{-(T$@qPYSM#|yfTy_hjbcMw;&~PB=3@d$cs1Wv^I~+bv~h{%w=Rq$=dF=) z^uOui*h(=$9XlSKb0L*Gc0B#KkUw@DX%%wL9y9L57_a8>N}o>)Ib+A2dw{~1N=dBbD3DetUvBB8+$T$L}%zNU3D( zVg7koaFd)~ZkLN0zm4(VE&@opbCk{l zp2}@wIm&KP#y7KI4!g#qjK7BKrR@0+@CTC9%lvSAaox`NgO2|Gb>L~dbBFo0Z=3l!MLUAdcb1+a+y`EsdQvLc~ z;7L!XKYWY%o#Xq^!(48ViRS}P<<4OJ(f!4B3FD8jpM)5{lks0D`&>L%mAG%7sDA(Z4tA*Gt%ns zG~4hirkOr>KoaZXgo^hcxqZDueWk8Z;;QvR->+1At5 zwGCOc6n-uhil?v!I)gZg4hK}l>DW2?T~ZOS(hBKyYg)7BmBqFwQjV z3E-eDL0Tw&e&vUv5Z5k|i$d+;NEF9a5l5wIh9XoPl53XWc)f5BonBX0=0PQboBTbU zaT7n_+H7`5IzxeN@>sSyPbp}@sGx939@NN;M|4!Fzb(=&vy*h3ZB~Kv_=3GsOMf^X ztE&hFa0(nrO-o)Bj>LLeV;Sj{kRR&8*;^So76&4oojAQKBZH^ZyQ03jDG&{H$1^fj zi1Y3;k}HF~aU5wDY|F^8(7$zaMq+t;G}xVyS`i$V>?t#;ZMKCYT_JyGMk$_!U365Q zYSUBlg(GpZHF9cs#20*ze65kK@2TA2IDAWFHT+@WifNt3Q!aT(wQx`8ICWSg8PRFt zIDIzKliATc%-S6dZZZ8^a6DY*nv`#fL?QUprY>d1*f@^kQPRQ@beryRYQ31HZ3;zW zaWfL`9EW$IIPDSwPti9#%opF*Jx&iQlU%`Az&{=v7ut$L_M*W}neE4gwno8tO0I5y zG#EZ57s(o@0_Cy-IAih@jI3JJA8sE<9CO9G$1SwNDipmx=+7K{Jd3y$#2|xX2*+ty zRWzMs$W}pwZwtoZu^1@KCJZfGa9UxT7)yd?NQA6%1bZG^XedA&3Eha11_!PBaZ+L% ze)mg53Z+X!A;Vl%w|04T9kTSanzg1l=+HD*HdfbrP4Ai-I+oSMIm@Qk$4FmIqhZ#3 zd`)$I?F!IR2Di3UggDsMDq-!)l}%n8u3Ejk&dW!TdKTFu8XwGxV?8q;+nMp?O|REW zqZmsI;+V}vW&;k@?BYWntE$AIjW~aiPF)m-ks{JYgBT1koS6RFhKUr4LZB{+@p^G& zY_r*fCaE+VaVjhifn{b5nLDzL<`uPdt)*TsMrF|ik!=$z)9c+L3KEt0pEw_tO{dPg z#C!;&`ve=Qrc4}qOM$-3yhi-6TpS!bp(aN4c6k7--*TJ6fmRraQSfrC*RD zS8L{ef2~nYhb>c2a-N-BRizz9E2-){TW18EK)jQUP;REwoX)nTcsj9TtW!kE)}*t% zd&1Vp6~x)Xm?_EN=~<{~5{Wpnuq(0!Bj4ZUb6x~)6~b4TP0KlaD@UAwspxcZF)l3W z?um6E#$f=516PpbagHl@n8R#~Ma&M2+H?Rq@>QzgWMchyOGC4$ndYaH5f)Mg-Ujqq zae}p0$otzF5LP)f{2eEvrx;$kG>s>d;#^P=Mv=}9Q=~ghKI6QGBGCJ)n@8!Uwbx`g z#S&}pg5vmUd=SOS0c*O~Vy?4`9emb$dsl@1Ck*lQ4) zo=(W))vQTdCr;uHfj%Zxn5)Hnj7BA~SzwJ!=Cy5|4XJ~O=z!%k(X4IE+}s)c$Ln27 z=2;XCin%Jry&$%^0zr6@w_I$-t%V^)_ypJF?T9$)D+aK!wH3Y(^*n(NJe1&Fn1B~s zC}X@n0EHn1u|CD0p?P>~mlL(7DbgA9Hmq!HiZq$29RSZtf|2fEx{xxN^|#ln)WXhL z>12#AnA(YT@T5Aw-H&K3FS`P5=wYZ0bZj;^Ve`+9U*%W?HrKCETXo{W%j9jXU<|8Q zy9Pn8u;3&`{lyTR`Z~c{yF=>JfOLxo3OGpbh zVp~<%_@&KQ%mQepgb4v#(b=Gx6(6V7y1P|unUvw1o)+L4T1Pap)gH#-e2YWjIG#yj z-%hO9@Eo_cwXW6U6?MVXc#L;Ws4{J8R*VOiy^b}G*j%LmhgXVE^@t1;!}0yg zY>maVcAwzVQCgG+zRFM}CLSl0*W$U?7_WfYJ43AjGZ5<$Ppx1DOj{aw0G||V&>Y!X z28z`t3c=7(?p(W#8AINWCs43H#w?fu-|Z0bd_qwUkL(cPL_Pk_c*((%-5+1EF!As$ z(2?2!(rgH&GEhrTMq^d92HQhn8DlBVP2$kE1VdkYFm5(?tI1O&I+?+IA#5k)L2bB} z>U^f(^i1rbBBsY`=_S|yDGxFyZ4|JcTZn1uB(8Mqy%*K#v?*j)Xo5zu+>Fg5_Kv)3 zn#^<$yX205r=l&iiI;b#SlU4$ui4}i)6_}cW0fOQZ8OcY@Kr2Kn1WHP1?a17&Hh&L z%pm2V)MTu9Y)d0SAl_M+VSjtN@D3eJ(%Pv8p=*sz5Xf9njF zc3o6}rbiGBh|-PXYbIuU7rw|Nm5_t9r*iBz@nmii>pR3SG;NkOp=1g}8G=w#JRE#~ zydKW-}|4AkIG+}EvX8+Wzzbqeut(I^s@^+|2odEp!y!a;?Gex3~L?v)q6h_bUXN+ z`9B6aj%|o2{ndMR6y%==Snt_!>hDMXGw@OQkFfpOVu!zfP{~Lg%cjT!2>b)ci`Z(V-hKf(YPXSNw+)-)u{t^ZEaz4_Z^i=vY+8DqM z*;M6M?>SL$2P>?~SNT=_zrgvIvq1Ge6b03Lh$#O#_&D=_1vn}~$v50c(B&3?@yn;w z%h~?-fUM_tyCuJZN`ED#kZ%PX_u3f7A6zXTVAr_Ti zy-!9#I%`M2oaH|YqMl#9cSgZmPuFwX{-dxjIr6Lb(c4cUe98~L z4IJ4|>$OUTPUT_BW_PATH + echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH + for n in 16 + do + for w 
in 2 + do + for m in 1 + do + for iter in 1 2 3 + do + export INPUT_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ./clear_static_ofccl.out $INPUT_PATH $OUTPUT_BW_PATH $cards + ./clear_static_ofccl_time.out $INPUT_PATH $OUTPUT_TIME_PATH $cards + done + done + done + done +done \ No newline at end of file diff --git a/test_scripts/ofccl/clear_static_ofccl_time.cpp b/test_scripts/ofccl/clear_static_ofccl_time.cpp new file mode 100644 index 0000000..4c49834 --- /dev/null +++ b/test_scripts/ofccl/clear_static_ofccl_time.cpp @@ -0,0 +1,37 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + cout << argv[1]<<" ofccl : "< a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (11+ranks);i++) + getline(cin,line); + + for(int i =0;i < 5;i++) + cin >> str; + + a.push_back(str); + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + cout<<"ofccl test time:"<qS3QgHXf0zt+ZHucbcP)cTl+));LsZK?Tv*WPQN zbLM0w)O74WzigPSv)5XC?X}n5d+o=0>^t1Gt1}!9O@&PDA2flKE}$wp@u<(3*~HGrUpKhdUCXhs!-ClmdEu_>>E5pt zVl+Fvhs&p+%I_0MM|SyZ1DjK8;QUh6Lot_IL24%yuILD~l$T!7(Rz7DAlTh=c~3?8 z<>jSCkx)^Ipp#yd2UXvib9n zvrx&PJc)+_rDqcu@$+#-JhJ@%mT{+J9(tapU4Y*r{0h&#=bT~xGo6w9W@ofDd-Lb- zp0f(*q~UT1pPPj8XJn8JKQ9IUw^QKHNFj$xmrQoJ>w6 zg+3RhDA#BT{K6FZzL`SL`%>t0OA7pNQpo>B3jME1!GCcI{Ie&0Q{Qpgg&=-a3jEJg*z@y{e}=X|tL&1(lwbQQ@E0VJ{~wS;{CjVe!~(=A+{SY1 zZE}WSAL8$2d_MDQ@DGt_tM9VQ$S0I*y580q3hI%lHyqVPn)Ie(-QBoMZ&S{@-KE17{NAL27BcY(T zBM{xDZz;A}D6#IRvQ5KUX>eqqDuX=qfA z*{(lS?Q{CZXsIvM9p%bH12(FzF&h5sdfRks3ofM1=hGwPL0p>N9NZc}_6`0>cc-8H zi%+>0cop9*y085feX}>vp~3G^dle@1M*Tg3D50njo5TK4mtWB~2dEKz8XiX{Y3!Uj zy-2)f)hc~yQL(nBwr1t34I6M>T2xZ5;j(IFjqWO1YNc4Oi%MKFs+Sa%8doJnW$>Yy znDu0fzghU3iTO$<<}!qp|1z}M0-r7TXy%l|bDb=}3}EReg99_3YDCmHQGe04(}@2` zW^jO$Trtt>@!N9zevDYNv>{IOU*Ee08DwkEb3A&7aCy=}f*M~5Zylf2_)2&-jYy)#X~HiQ82Klk^_h(*<*h=21&;<{1g8a$A>9at z7CbivK`XZ4t?P~o3!d^;q0)j^bxAaj1&;<~gnA49R0FGN8!UJ4K{ z(O`|xWx?|jRnU4Y_&kGK(|Rp<>QgG*Wx=1JkbpZacAvqqdYUdBfc8gbfi86W*OBTgGF;gE;Qn_aWX!7 zh7qR?lkri95ziCxcRx_&r4164--y%32+D87X+s3%H{!Grg7O=2+5kcMjW}(5p!`Oh zHat*%6^HfD71poTg#8h3uA|y8<8`*U+ZMmt7B92KFSo@nw8hW0#pm1Nr`zJ$w)kIa z6ZQSQE&jGG{+ccRvMv7Kw)jtN@gLjbKeEMtXp4Wx7XN2!-1T-%?Ae< z)edBrW9!K?aQc&|D9^yv2T@%U^Y2BH_jghh7oLIa#|bPy9-WV_6{Kr)ty3mO^3LxC zyLN~n_`e|d%kCy{$p@a;nCIZ{uJIf^p6PKs?RoZt=sXB;vw)n5k4AAQ7&$_Ts2zJL^gP#8R^GKoo9Ypl>4Z?k($YnZql0C7Z z_!_ia@QgqelP6#DT+KX-1W$s1*we7aF(isVPhtnkKE}L5LX)RphjU3J_+r0^|D2G9 z1_h9w_o$Zl#9&{%IJV;@Az+7-GJ!&`%jhPKUM!;{1VH^c{F7&3hsM-HGFr*eZ%Jku zB@F{))aZ$ciJ=`wK!a7Zz8}$5LEh)dX%6l938VXk9({wSE*Gdf&POh$oP{K33FPz% zs^i%3(2ik4AYEmN6i%TC<@^rW=2Q^+1f%2FQP02^1$khHEX?y%o^trtp&ic|qUb8q zmidDu>U)1CQACEQK2aLSXfYZCZ2tguQIX}wYr(e$N_t{s|F6^-_HPgtILf*7@1QG> z4;ry5L-8W$;aPqt@7}@fgHcqKefmL%r$Suh`) zZ3ACqJfwOwhWGr~?I&X0$NO$S;pi^8^Y-J^YP&B% z^*&0~8yC9F|Kl1eYuPt|g6&?U7z8f>ap!*+!k(a{Ptt%N--Cu?$t8CCaZjv!%+v24 zn~dE#8AXhqes~4PB#n|&~a((C}Pr_$`1sp$RnhAA$2$`c#L0FW&@0+|Q?l?wk1 zm4CM-$D?S7Q|0(g;*Fu>k`RxSg2G8bU$zJe*o>lVZWyMX6IbS=rlPJyiOpjya*Gn# zUzy_>sCP=W>vN7>Ag_kv_X>>;dis&ZEYn=H*o1PZ`tDcxVrW!Wv9=Af(V>8-2uMBn zO18%_Y-?}G%7Rg)6ABE)?}7_Tn)?u%5?Aj~B~3bIRS#8)J7l&tji|LvI*o=EYmR&R zZXeU4AD31`w7*VSY(a{=6SPf~WtdL*&pA?nrG47C-EnF)dBV-2B_LQBjm?o^+7e@rx2t z?MEBYEY@saA{Y~ImGq(b2Q+rl_&^#A#m~ip$kSg8D^XTQ zJh5LA_Pa9o{hmJeey2)#I5FjKiAQtwua&~fY!u*?CHM|=wChQgfZiP_#eU@u8sY!HugsRv+rWQaxQ5? 
ze6e2?W7Un8TupV&nL#Yy>Q_l857>@Cr$Oknf3hb4X(;}j z=#Yn$#{c|B*!Pk{p4bb(3}OJ2Q&Oyu(1*eP#u(^ax^2XZuE^1wMHKB~0A64KP&$Yw zP#WrI4j@qd%zQO5FL#fG=PJxvQUsVSxeBu!7%|5|D(fsUJH8NBLo4uDTY;?h$^IC% zZ~Ec`X8Voby8mgW{T0}$P$7Y_JXw(xRh*2uZo$T`*wo7D|lYfcO?Q)rKLPUp8ZQn0*Ccg4r!P%YpxK)G%xy+|}K7?nJ#~aA4%bgsbTuQ>uL_S*nw?P@4H7 zJ!B-#7K&h#1bvfeFv-z0O-`Y2?g z*T-d>!P1Kg7emGa=mi7RuS8FnLr$HB;>03fg`BLbmt?9XT7gwOK^3>$J(hPrEvK{N z8EzgcQRD>n3 zNtPsEP=V}GVjjU_b%V(Eh*5i2xrd`GfDtx%hG#bH8lMzVn!*iAx8RBWEIurZ;6Ca^ zr;Iy~JuV8-ceujhDJJKtavXr4F^7;5h(bj!m#Cq*Q#2zQcwm;Hc%{hxS9H06C?erd z{CTMW@_v{&k6lHkCZ^0csX0(*q4J3q_$~&i_@9B7Jcr_W$G{`^3+B;fqT;X?wU^_t z))>ZCk_3n`f`;oDP1sPwMx8xFiZI76H5)P-|Bn-Q4U8%k>q&GIQME$JLy&_RqNVz| z9;xpbOUQgmF}be6VJd!dGy23LKucHrtmC7M=sT}|xys^*&5@2i@sVkeqC1D78Fy0P|> zzd((g_|IO0{DXhVjGpg$k<-`4#xd!uj=fPeG4I!&zC#Yr@|U~cM7eTqxRIuJRX0`X zhc<66$~%8MWhe5ZDi=+X?mHQGngn^~&x{3TEHGn%84LWKEa1R9Eh?xhOg97;n>cvm z#(W2-64$->i5D)2nHUgG%5P_~q@l?jiJ?aT=(hiC{3y>;@T1%Kr{OmTKQZmbpRnfD zoT1G=Ux)KbrzS6={z%jr4Rrdi)GjrGw#Dy@hQe1~sn_XvmyX`h(*wbXKOEH@3o@^| z3;NeX?vh_mOdP=VLn9LtuL2$bJOQ}mm5GT50q6gAV&Z)$`Nr!L6J1cG^bJfH0P6tf z0q+AW1>6GI1o%zBDB#n8y8vGYd;~BX&Qb^$&PxD)VafDZ$H9QV|Q0h<8F z0KWj3k9P7PU@_nrU_Ia*Y!N*IcpKmeK=m$+rwP;P5MtjKO1zR zucB`@>2HC4Dd^&TFh#%4q|bpqQ_+{4gbwkV=a~nVAPOg(6hWUC8G*==Z70@d)T+phLbXqs1)8bD*C9Jy|)ONJ4fb`X>Oh z9)NX~9vL1Xgm`G7Nw)A1&z%;YJIs7mz(1`6&mG_~Wo$9Zu@Us|f_|};ev_I0Hqajh zo$O+yzrm#M0sV2%>2W55USrY^fc_BZ$;wG~e2M8;LYV0TX8IGLKY{dlR{Cm_eh%u1 z^j~hJcbW7RpdXn+zm1@i{zcaGeltDom{aEls{vQV4(6H0R2oX ziOQ|?PBZ;Wpf3SES^3E3kD&bLfc`#U);Hj6RXcE_J-(h{jUuAT6r>72?i=}Dd?BdIB2Cq2DN;mH-Wwm z^kr6h(3Bqq-4FVwtn`d}L;fz%zY2OXdpMEi5zu#nzQLNF%3$$@N5OL#JgIDc9Q0>E zPt`XIVCMe@eT6lTh*^dT(8oc)$V$J_q;CM7<_f9Weh=ugFgHlmc6Wn*4(O@sU_a;{ z(95j(cbNGf1^p(_7g_1ooAl$LdqGcTW2(af%pDp*Pvu)GKyLs&V3nV-Nn}9jX^ygI z3i&j5`5fr0t?6%<`ZGFy`TeV})M zp3onV{xh8Z2(Po;kx^w0|v6L>DxEPqC|D4-Ll9LzcQfG*zMS9Ks=5~a|{*DV20 zJ@W7qVs&o*=QC<@3qF_W&UFrCxpNEe$*#^V{-;@<+={;0YjP`txfNBp#Z|e5D|4MI za|>4H=C91n5#LLQLQW6lzy%orkJg>J_Ga9>b7X&gw)WkOXEI-&r9CklETD)VuD*Ps zW5HF|YuZm7c z@xak{92Pz!^O=ho3(Qzx#sV`In6bc&1!gQTV}Th9%vfN?0y7qvvA~D5fY@NbH<7V4 zrJ%mEr`D!+NPDLg6rR2TNMViyn&y-jr*pjLN_ipH>9|zi0bIazb*}|)sEMGy`$u~S z6tE2@@W)wf(t?_zEt#(p%hf0t>b?L{RyQJuF}pA0BxZ<%3-X3Xdp@ z_--~VZsm%&(i682Mf_T(zI}>UN36PKl{xSkG`1!!Cxs40kfz%kW`_ zgA9imjxZc!s6J+xzfcpw$*`DVCBu4#n;3R6>}9xoXu1h3drAQ4k(Yp#%MO)bD>!1}$@tKW0|M)}1=e1Vp%UuJ4&8GbC0A6~Z0ijmiumhXXEshP%U z5+>Gx%Pj5P2@@l)$PHy+x3qpyB8A)|;d^a(st&@d@kXVhdr%JOvp*~`lph7U@8C#c zhfd&0j{2S~@lx1^pF^8$&)vW$)Bh2{KUd4C#07%N_h;gIu69Z#VNLw2;(D%jY9(P! 
z{JY|Mf%ce9e-h}xm-XjV5;y4}!^Moxuat3>?{dbUVZ+l44kTx;4Sy5k=P_Q%zn$>~ zHvCr@e~t}LuPT##rwu>C_{BDSHpWNdFSOwoGk%E;znbyIHoTAVr8fMXjIXfa_cDHk z4L`*AN*n$i##h_$ABTS={XI53oeMztS{uHN@%1+R=NaE*!+)Rg8*KO&8NbnnpM{E} zd^g$f^x8k+TW$F38Q*Th-@*6}8=l@#BspC+{Bw+t+VCeB-($mHAnr@d)o!!lYZ%{a z!{5sI?Kb@VjK9l<{}JQwvEhHq_?fJ2_)gwd|S6YsxJgdaB@!#{i{tXkjDNf^ z5kJQGIpQHDJafUqHk>Nuyme_J|8C&(wX?MX+j#ml<}c)YS)7)E1qkVLHd@C^pA$wMoO9AJx zoO^*!#{W_Z{-QIaoZZW%fLfNb7x;xJm->D@n!N}IfKMiW4v?hJp$f_8Vg44zKh5n^ z;h#gn=VnZv_dd-0H*>q(llRe*KC zll&LBzbKxE(cq~+Iu}W9RgXVl{?oXI5GC!KV2)h{nM_YgY7VtFQ*~h5}{CqyvYlC4H zz~J9SdrQPo%;A*z+WpUZ#=pS%BDn}(0X~_1{vCMIe+L^<@tlW&lJFT^FXd8<_6^`k zPkVd#9`oDB_jiC-GFgNG^DqcfKhmz0SU(f#DRRoUiuFhL7hwtGbIW8r!1x&BkCaHn z2F7Qgf0CT1SaGGpCm8=Mw=3@6+8*E)4~r0Be+v9BSr>76d*Sd&pMT(XuG;M= z@J{Bjf^3Xm#Q!ap54R>lD~)SbM!vp{`R(oPKLvlj*2VRk&HOJgzuiAyf%>L=ALsT4 z^NO&G@gBB=@~dw%-tOP3;rB_-=eS~#oe1|a-pPJa)%yp)Cp(vI9vDc@qs)kE5}^h0 zWOlfd`7_xMi02iN;tk-PGHJr!KQRAwY-esRS{v$#^8F;Ye+=s)bOG<=Yb&^$`ETNU zxp`^7W&B-$6qL?u!9eou?cwXdD;}AG|N6qwNVFSo()u*>Tsb}3sr%?;xd_g9Y7Oaa z9ibL)hu(@8pCY=qyGQedI=edjQGaVuMR7UKm`cW^&UY1U8Viozix^S#lW3<+_yer`M`L_ll ze%%)e;$X9GA5Pm6q^07uEHAW$%r=QE53~hCVVqe-92L4A2vMGhu3L^X_JZAXgk5ck z3nlSy_I7tfb-dVhi{2IL2>7q@-6sexMTvaHZr}<_mRn;0Ujj46b7Ls=BI1 zUpUYeP03Uy4!uiBuJHFnaju!aH6_PV@77yV5=+~{{;rhNGXIohSBXw_vn?3v40tepGp_9uaKL20I*Am+LSLN%Uf^WGjhBqjhV$w$ADwRB>TClrg ziZYZ-Ms%7ej-U;7r*3F2X6*|5x9Hw2I3q4~NlG_|!Vr9NQx`L1WQvC4QqqDUbepaz zO1+GwZ4QJZQ9Tswn1XkyIPww#PtrGB%op9(HAN39kzD?W&pQV z6R3-9+-{s3+oU(5N-FdQ91Y7uV2NH$<_>M6$wf_VOR?LHQCU<$XxsGibi22Rj6@;+ z9*3l|>C~Z@m=s}jpJpXhmxz;Z$leU`P;d1{ zy+%IMEc_}_ct43Unn${9V49QERhpVJK-bcsf2%xqm!^-?vbVd!=w6-mYw46;%M_Ci z_{BW56SL_64saHkOs`Kx%jK5{YPffmqj$McA{?)~4E?HRJ)9fPbJe>s(0;^rG>*JY zc6@O{zaT}{TFm{?TBDRsT&A96KR~&%(sClLq^d)0?H+I%{!Ugxsh&`BI^dT4>GaOB zP7oz)lMeLm4jLnu9|sI$rX-!GYpJ41B>c$I&d?T&e1B7$bHi~f7rshwT*>ZRHNs=|XX) z2y(YFmQZLt>1h3~GAv^;mopY)AH@nm%)G^939CJ0AUQTdx?4t+dpW3-zQH%x+p+W2sB4 zUD-fk1#=BT)6;3Scr|OXtP|6?Ltq_~%Jgf+e2hjVu~}e@O#1b$9rcNWi0FW&G|{YS zNL}42+mGA5p3GAo_KUeH#yvl_xqN=KB6q3Sj9U*w3ik=F^zDe)$}0x2$)y!-AL_Y$ z?YJ|+yD&aCwooRudLI;q6!`jt_6*H~TRZKjb&a8ph`WAuLu05>PwW7=Vd4*U`IEVn zbe7*-uTl-$XQk;FUof>3>)^CDezPCZT3&Vq+R(#L>ubM7-;B*a(|?s>5!h6>N^RAN z8!zeGTEQ4rr*;keZehVRdHrRRJ~LQjc1Y_qAlc%9oX;|NRhmnxev`&R8=}NsimYPH z&M*LXdT;UTa>2P7+a3NMpC7lxjK$}4d;|3|+Y(lERE#fjxz-q^djVU$;ntd|2Aiq8 z52g{x9T40Odm#2cM$!Lf7mlTR9dtZzoo4xQ|=YQrqHki?U#9JgId~t1zu@6%j5N{iCKR}lzB#O;LA8r=7q)GA>1jzEh~_eHwJT`O1t)0TQ3 zz|&$4nj;&_K(X3HCKx(O?Q6G5W5`Ew2MX55m<3bd*$&~)r{(2v%MKn+l;dyom+UOL z`Qs4_9XH>6?TH;A$%arO1J!goDyyu;-xdf;A4`5NjYHpZ41I0>sNU42CQqSoI-U7a z*iOiU+Gtv;^XXpQHNBgPm>#R8mt6lR+{jGZC}7;T5YyB&u5`@37uD%3Q^?N1G>u|3 zGd7RdJ94jU)RW!pk~;#fvev+6Uf$_qX$OVedZR~7Q`5Z1Do3W8CYopAQ7lZD{9&vG z==ruLZ;QBRkZ@Bf9V;Hw((s$j!L0%G;s$@DyVH*~$*NWQ(xT$@jV^L|WqNIT@J>ok zV{cXCdzvd}nYzo5#v^yWg$woHI=#l|Zb3tuG#R7yxV&4HX6@CWiagWdYfFuNQ8@2( zF7Zt((?b|)Cr4O%sn6K6G^YG%8ae_!v=)}_Qp{V$V>9NW+<3+WwPHBuP|MAD-~k)9 zP~yYRaB0^?IcT~CQ4cTOARaT(+dA6YTBYNV`_lx-Ez-B#ee7Ws5%?!Si7W z5gjzNWiKV*tFDfyRzx@IwIW;=wT1X1;`eDq*k?ynco zZ%_Y8&~cPQNa?S>x1%6Gu5WyA$F9E@=}*H?r9aC0E4Yc{R7SF$O0VA62mnVcD!ux? 
zkAmuZK41~SZvR_>rEle^^z{BK1qD4aYUr={6#NY6^z9uLSKl#Fa4)AP{Yg)yKclq) zT#!vwdiA{%1$$ZHS)4$nSLMH-)30Q@`Ywus>U%_#{w)0L=^p@&vQY9hCn6M_;xFES zN(}b;-vhEWz0)cA6;%2wDR>h~1O@*EF>8ADJr)HOA1A{b9Wp5Rur0m%Zi|A}GdL)v zN~hpsw)EFg1?q==f{~x5s ztJb1#P!8&QU&9+IX4aRCneA8UkqPDLT`zVM#`gzCIKARal|B#YsqL%u>ialxl|S2$ z`Y#0~Pql+@14s5#>DBkZj^9Mw;3uLbsPqaJfNo81yf3*)CQ|)eiBRcO{5&KkoXWql zi_>kAu-8`ql@e|IUBvkp8yI;}ut`FGt;(R1t`w}$mW7~uHVL57d(#x?{XHE&rN1iI oDZnY+acP8o-STosz39ahm9Jtv0~g8ChrTX3=u8`Hutl~12iOU-{r~^~ literal 0 HcmV?d00001 diff --git a/test_scripts/ofccl/run.sh b/test_scripts/ofccl/run.sh new file mode 100755 index 0000000..c3a25b1 --- /dev/null +++ b/test_scripts/ofccl/run.sh @@ -0,0 +1,46 @@ +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring + +export DATE=221221 + +export TRAVERSE_TIMES=10 +export TOLERANT_UNPROGRESSED_CNT=10000 +export BASE_CTX_SWITCH_THRESHOLD=80 +export BOUNS_SWITCH_4_PROCESSED_COLL=0 +export DEV_TRY_ROUND=10 + +# export SHOW_ALL_PREPARED_COLL=1 + +for MY_NUM_DEV in 2 4 8 +do + unset CUDA_VISIBLE_DEVICES + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi + export RES_DIR=test_result_${DATE}_${MY_NUM_DEV}cards + if [ ! -d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 16 + do + for w in 2 + do + for m in 1 + do + for iter in 1 2 3 + do + export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M #16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/ofccl/static.sh b/test_scripts/ofccl/static.sh new file mode 100755 index 0000000..3a65584 --- /dev/null +++ b/test_scripts/ofccl/static.sh @@ -0,0 +1,21 @@ +g++ statics_ofccl.cpp -o statics_ofccl.out + +g++ statics_totalCtx.cpp -o statics_totalCtx.out +export RES_DIR=test_result_221120_2cards +export OUTPUT_PATH="./$RES_DIR/result_statics_all.txt" +echo $(date +%F%n%T)>>$OUTPUT_PATH +for n in 4 +do + for w in 2 + do + for M in 4 + do + for iter in 1 2 3 + do + export INPUT_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_M"$M".txt" + ./statics_ofccl.out $INPUT_PATH $OUTPUT_PATH + ./statics_totalCtx.out $INPUT_PATH $OUTPUT_PATH + done + done + done +done diff --git a/test_scripts/ofccl/static_time.cpp b/test_scripts/ofccl/static_time.cpp new file mode 100644 index 0000000..c079845 --- /dev/null +++ b/test_scripts/ofccl/static_time.cpp @@ -0,0 +1,32 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + //cout << "bandwidth"<<" "<< argv[1]<<" "<< argv[2]< a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + double tmp; + line >> tmp; + + a.push_back(tmp); + } + cout << argv[1]<<" time: "<>$OUTPUT_PATH +for n in 4 +do + for w in 2 + do + for M in 4 + do + for iter in 1 2 3 + do + export INPUT_PATH="./$RES_DIR/test_result_"$iter"_n"$n"_w"$w"_M"$M".txt" + ./static_time.out $INPUT_PATH $OUTPUT_PATH + + done + done + done +done diff --git a/test_scripts/ofccl/statics_ofccl.cpp b/test_scripts/ofccl/statics_ofccl.cpp new file mode 100644 index 0000000..462fffe --- /dev/null +++ b/test_scripts/ofccl/statics_ofccl.cpp @@ -0,0 +1,36 @@ +#include"bits/stdc++.h" +#include 
+using namespace std; +int main(int argc,char* argv[]){ + //cout << "bandwidth"<<" "<< argv[1]<<" "<< argv[2]< a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + double tmp; + line >> tmp; + line >> tmp; + a.push_back(tmp); + line >> tmp; + b.push_back(tmp); + } + cout << argv[1]<<" algbw: "<>c){ + if(c == '!') + break; + flag =true; + flag2 =true; + for(int i =0;i < a.size();i++){ + if( c != a[i]){ + flag = false; + } + if(i < b.size() && c != b[i]){ + flag2 = false; + } + if(flag == false && flag2 == false) + break; + cin >> c; + } + if(flag){ + cnt++; + int tmp = 0; + while( c >= '0' && c<= '9'){ + tmp = tmp*10 + c -'0'; + scanf("%c",&c); + } + sum += tmp; + } + if(flag2){ + cout << (sum * 1.0)/cnt< Date: Wed, 21 Dec 2022 10:27:58 +0000 Subject: [PATCH 071/109] +order --- test_scripts/nccl/run.sh | 7 ++++--- test_scripts/nccl/static_nccl.sh | 5 +++-- test_scripts/ofccl/clear_static_ofccl.sh | 5 +++-- test_scripts/ofccl/run.sh | 7 ++++--- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/test_scripts/nccl/run.sh b/test_scripts/nccl/run.sh index 0e63f35..8b92e0f 100755 --- a/test_scripts/nccl/run.sh +++ b/test_scripts/nccl/run.sh @@ -3,6 +3,7 @@ export NCCL_PROTO=Simple export NCCL_ALGO=Ring export DATE=221221 +export NCCL_ORDER=1 for MY_NUM_DEV in 2 4 8 do @@ -10,12 +11,12 @@ do if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export RES_DIR=test_result_${DATE}_${MY_NUM_DEV}cards + export RES_DIR=test_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards if [ ! -d "$RES_DIR" ]; then mkdir $RES_DIR fi - for n in 16 + for n in 32 do for w in 2 do @@ -26,7 +27,7 @@ do export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" ## Time echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M #16M 32M 64M 128M 256M 512M 1G + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G do ## Test /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH diff --git a/test_scripts/nccl/static_nccl.sh b/test_scripts/nccl/static_nccl.sh index 761ff36..9cf7ad2 100755 --- a/test_scripts/nccl/static_nccl.sh +++ b/test_scripts/nccl/static_nccl.sh @@ -2,15 +2,16 @@ g++ static_nccl.cpp -o static_nccl.out g++ static_time.cpp -o static_time.out export DATE=221221 +export NCCL_ORDER=1 for cards in 2 4 8 do - export RES_DIR="test_result_${DATE}_"$cards"cards" + export RES_DIR="test_result_${DATE}_${NCCL_ORDER}_"$cards"cards" export OUTPUT_BW_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards.txt" export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards_time.txt" echo $(date +%F%n%T)>>$OUTPUT_BW_PATH echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 16 + for n in 32 do for w in 2 do diff --git a/test_scripts/ofccl/clear_static_ofccl.sh b/test_scripts/ofccl/clear_static_ofccl.sh index 1d70cb4..dc8646c 100755 --- a/test_scripts/ofccl/clear_static_ofccl.sh +++ b/test_scripts/ofccl/clear_static_ofccl.sh @@ -2,15 +2,16 @@ g++ clear_static_ofccl.cpp -o clear_static_ofccl.out g++ clear_static_ofccl_time.cpp -o clear_static_ofccl_time.out export DATE=221221 +export OF_ORDER=1 for cards in 2 4 8 do - export RES_DIR="test_result_${DATE}_"$cards"cards" + export RES_DIR="test_result_${DATE}_${OF_ORDER}_"$cards"cards" export 
OUTPUT_BW_PATH="./$RES_DIR/result_statics_ofccl_"$cards"cards.txt" export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_ofccl_"$cards"cards_time.txt" echo $(date +%F%n%T)>>$OUTPUT_BW_PATH echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 16 + for n in 32 do for w in 2 do diff --git a/test_scripts/ofccl/run.sh b/test_scripts/ofccl/run.sh index c3a25b1..f7158da 100755 --- a/test_scripts/ofccl/run.sh +++ b/test_scripts/ofccl/run.sh @@ -3,6 +3,7 @@ export NCCL_PROTO=Simple export NCCL_ALGO=Ring export DATE=221221 +export OF_ORDER=1 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 @@ -18,12 +19,12 @@ do if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export RES_DIR=test_result_${DATE}_${MY_NUM_DEV}cards + export RES_DIR=test_result_${DATE}_${OF_ORDER}_${MY_NUM_DEV}cards if [ ! -d "$RES_DIR" ]; then mkdir $RES_DIR fi - for n in 16 + for n in 32 do for w in 2 do @@ -34,7 +35,7 @@ do export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" ## Time echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M #16M 32M 64M 128M 256M 512M 1G + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G do ## Test /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH From 1c9a007bf2a73d90f2d4227be1650c5ef765a52f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 22 Dec 2022 09:27:09 +0000 Subject: [PATCH 072/109] 28 is occupied --- ofccl_test.sh | 6 +++--- test_scripts/nccl/run.sh | 4 ++-- test_scripts/nccl/static_nccl.sh | 2 +- test_scripts/ofccl/clear_static_ofccl.sh | 2 +- test_scripts/ofccl/run.sh | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 3465366..073c8d0 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,10 +42,10 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=16 + export NITER=8 export NBYTES=8K export WARMITER=2 export MITER=1 @@ -95,5 +95,5 @@ elif [ "$RUN_TYPE" == "NCU" ];then fi echo cmd=$cmd -$cmd #> /home/panlichen/work2/ofccl/log/ofccl-2ms-coll-master.log +$cmd #> /home/panlichen/work2/ofccl/log/ofccl.log diff --git a/test_scripts/nccl/run.sh b/test_scripts/nccl/run.sh index 8b92e0f..198528b 100755 --- a/test_scripts/nccl/run.sh +++ b/test_scripts/nccl/run.sh @@ -16,7 +16,7 @@ do mkdir $RES_DIR fi - for n in 32 + for n in 8 do for w in 2 do @@ -27,7 +27,7 @@ do export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" ## Time echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G do ## Test /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH diff --git a/test_scripts/nccl/static_nccl.sh b/test_scripts/nccl/static_nccl.sh index 9cf7ad2..d8f5883 100755 --- a/test_scripts/nccl/static_nccl.sh +++ b/test_scripts/nccl/static_nccl.sh @@ -11,7 +11,7 @@ do export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards_time.txt" echo $(date +%F%n%T)>>$OUTPUT_BW_PATH echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 32 + for n in 8 do for w in 2 do diff --git 
a/test_scripts/ofccl/clear_static_ofccl.sh b/test_scripts/ofccl/clear_static_ofccl.sh index dc8646c..2c3849b 100755 --- a/test_scripts/ofccl/clear_static_ofccl.sh +++ b/test_scripts/ofccl/clear_static_ofccl.sh @@ -11,7 +11,7 @@ do export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_ofccl_"$cards"cards_time.txt" echo $(date +%F%n%T)>>$OUTPUT_BW_PATH echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 32 + for n in 8 do for w in 2 do diff --git a/test_scripts/ofccl/run.sh b/test_scripts/ofccl/run.sh index f7158da..ab10f96 100755 --- a/test_scripts/ofccl/run.sh +++ b/test_scripts/ofccl/run.sh @@ -24,7 +24,7 @@ do mkdir $RES_DIR fi - for n in 32 + for n in 8 do for w in 2 do @@ -35,7 +35,7 @@ do export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" ## Time echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G do ## Test /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH From 54ff526838e821662cac1be18a4c3bdd5161f6b6 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 22 Dec 2022 09:57:43 +0000 Subject: [PATCH 073/109] fix bug in nccl-tests/src_manual_size/ofccl_all_reduce_ms.cu --- src_manual_size/ofccl_all_reduce_ms.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu index 74f4866..ccde169 100644 --- a/src_manual_size/ofccl_all_reduce_ms.cu +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -166,11 +166,10 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t return testSuccess; } - - -#pragma weak ncclTestEngine=allReduceEngine struct testEngine allReduceEngine = { AllReduceGetBuffSize, AllReduceRunTest, AllReduceGetCollByteCountList -}; \ No newline at end of file +}; + +#pragma weak ncclTestEngine=allReduceEngine \ No newline at end of file From 57875d6dac337d1e422d02564a40626b345c3908 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 06:28:52 +0000 Subject: [PATCH 074/109] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E5=AE=8C?= =?UTF-8?q?=E6=88=90=20auto=5Ftest=20=E5=BC=80=E5=8F=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 4 +- test_scripts/auto_test.py | 186 ++++++++++++++++++ test_scripts/nccl/run.sh | 39 ---- test_scripts/nccl/static_nccl.cpp | 7 +- test_scripts/nccl/static_nccl.out | Bin 43920 -> 0 bytes test_scripts/nccl/static_nccl.sh | 29 --- test_scripts/nccl/static_time.cpp | 6 +- test_scripts/nccl/static_time.out | Bin 43920 -> 0 bytes test_scripts/ofccl/clear_static_ofccl.cpp | 5 +- test_scripts/ofccl/clear_static_ofccl.out | Bin 43928 -> 0 bytes test_scripts/ofccl/clear_static_ofccl.sh | 29 --- .../ofccl/clear_static_ofccl_time.cpp | 4 +- .../ofccl/clear_static_ofccl_time.out | Bin 43936 -> 0 bytes test_scripts/ofccl/run.sh | 47 ----- test_scripts/ofccl/static.sh | 21 -- test_scripts/ofccl/static_time.cpp | 32 --- test_scripts/ofccl/static_time.sh | 21 -- test_scripts/ofccl/statics_ofccl.cpp | 36 ---- 18 files changed, 197 insertions(+), 269 deletions(-) create mode 100644 test_scripts/auto_test.py delete mode 100755 test_scripts/nccl/run.sh delete mode 100755 test_scripts/nccl/static_nccl.out delete mode 100755 test_scripts/nccl/static_nccl.sh delete mode 100755 
test_scripts/nccl/static_time.out delete mode 100755 test_scripts/ofccl/clear_static_ofccl.out delete mode 100755 test_scripts/ofccl/clear_static_ofccl.sh delete mode 100755 test_scripts/ofccl/clear_static_ofccl_time.out delete mode 100755 test_scripts/ofccl/run.sh delete mode 100755 test_scripts/ofccl/static.sh delete mode 100644 test_scripts/ofccl/static_time.cpp delete mode 100755 test_scripts/ofccl/static_time.sh delete mode 100644 test_scripts/ofccl/statics_ofccl.cpp diff --git a/.gitignore b/.gitignore index 5999837..81a260f 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ .vscode -test_result*/ \ No newline at end of file +test_result*/ +*.xls +*.out \ No newline at end of file diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py new file mode 100644 index 0000000..06799ae --- /dev/null +++ b/test_scripts/auto_test.py @@ -0,0 +1,186 @@ +import os +import xlrd +import xlwt +# 设置环境变量 +os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" +os.environ['NCCL_PROTO'] = "Simple" +os.environ['NCCL_ALGO'] = "RING" +# test +# f = os.popen("./nccl/run.sh") +# print(f.readlines()) +# 设置超参数 +# run +DATE="221222" +runNcclTest = False # 运行nccl测试 +collectNcclResult = True # 统计nccl测试结果,写入xls +runOfcclTest = False# 运行ofccl测试 +collectOfcclResult = True # 统计ofccl测试结果,写入xls + +NCCL_ORDER="1" +resultXlsName="result_"+DATA+"_"+NCCL_ORDER+".xls" +n = 2 +m = 3 #nccl +w = 2 +M = 3 #ofccl +NUM_DEV = 4#设备的卡数,实验用到的卡数写在循环里 + +# static +os.system("g++ ./nccl/static_nccl.cpp -o ./nccl/static_nccl.out") +os.system("g++ ./nccl/static_time.cpp -o ./nccl/static_time.out") +os.system("g++ ./ofccl/clear_static_ofccl_time.cpp -o ./ofccl/clear_static_ofccl_time.out") +os.system("g++ ./ofccl/clear_static_ofccl.cpp -o ./ofccl/clear_static_ofccl.out") + + + +table = xlwt.Workbook() +bwSheet = table.add_sheet('bw') +tmSheet = table.add_sheet('time') +cnt = 0 +for MY_NUM_DEV in [2,4]: + + if 'CUDA_VISIBLE_DEVICES' in os.environ: + del os.environ['CUDA_VISIBLE_DEVICES'] + if MY_NUM_DEV == 4 and NUM_DEV == 8: + os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,4,5" + # nccl + # 创建存放实验结果的文件夹 + NCCL_RES_DIR ="./nccl/test_result_"+DATE+"_"+NCCL_ORDER+"_"+str(MY_NUM_DEV)+"cards" + if not os.path.exists(NCCL_RES_DIR): + os.makedirs(NCCL_RES_DIR) + # 统计结果 + NCCL_OUTPUT_BW_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards.txt" + NCCL_OUTPUT_TIME_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards_time.txt" + + + if runNcclTest == True: + + os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_TIME_PATH) + + for iter in [1,2,3]: + NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + + os.system("echo $(date +%F%n%T)>> "+NCCL_RES_PATH) + for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: + os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RES_PATH) + + os.system("./nccl/static_nccl.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) + + if collectNcclResult == True : + # bus + bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡') + + with open(NCCL_OUTPUT_BW_PATH) as f: + content = f.read() + bw = content.split() + + axis_y = ["64" ,"128", 
"256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + for a in range(0,25): + bwSheet.write(2+a+cnt*30,0,axis_y[a]) + # + for k in [0,1,2]: + bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k)) + for i in range(0,25): + bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2]) + + bwSheet.write(1+cnt*30,1+15+k,'nccl-busbw'+str(k)) + for i in range(0,25): + bwSheet.write(2+i+cnt*30,1+15+k,bw[i+k*50+25+2]) + # avg + bwSheet.write(1+cnt*30, 4, 'avg-algbw') + bwSheet.write(1+cnt*30, 19, 'avg-busbw') + for i in range(0,25): + bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ) + bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3')) + + # time + with open(NCCL_OUTPUT_TIME_PATH) as f2: + content2 = f2.read() + times = content2.split() + + tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡') + for a in range(0,25): + tmSheet.write(2+a+cnt*30,0,axis_y[a]) + for k in [0,1,2]: + tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k)) + for i in range(0,25): + tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2]) + # avg + tmSheet.write(1+cnt*30, 4, 'avg-nccl') + for i in range(0,25): + tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ) + + + #OFCCL + # 创建存放实验结果的文件夹 + OFCCL_RES_DIR ="./ofccl/test_result_"+DATE+"_"+NCCL_ORDER+"_"+str(MY_NUM_DEV)+"cards" + if not os.path.exists(OFCCL_RES_DIR): + os.makedirs(OFCCL_RES_DIR) + # 统计结果 + OFCCL_OUTPUT_BW_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards.txt" + OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" + + if runOfcclTest == True: + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_TIME_PATH) + + for iter in [1,2,3]: + OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" + + os.system("echo $(date +%F%n%T)>> "+OFCCL_RES_PATH) + for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: + os.system("../build/ofccl_all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ OFCCL_RES_PATH) + + os.system("./ofccl/clear_static_ofccl.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./ofccl/clear_static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) + + if collectOfcclResult == True: + + with open(OFCCL_OUTPUT_BW_PATH) as f2: + content2 = f2.read() + bw = content2.split() + #bus + for k in [0,1,2]: + bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k)) + for i in range(0,25): + bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2]) + + bwSheet.write(1+cnt*30,5+15+k,'ofccl-busbw'+str(k)) + for i in range(0,25): + bwSheet.write(2+i+cnt*30,5+15+k,bw[i+k*50+25+2]) + # avg + bwSheet.write(1+cnt*30, 4+4, 'avg-algbw') + bwSheet.write(1+cnt*30, 19+4, 'avg-busbw') + for i in range(0,25): + bwSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ) + bwSheet.write(2+i+cnt*30, 19+4, xlwt.Formula('SUM(U'+str(2+i+cnt*30+1)+',V'+str(2+i+cnt*30+1)+',W'+str(2+i+cnt*30+1)+')/3')) + + # time + with open(OFCCL_OUTPUT_TIME_PATH) as f2: + 
content2 = f2.read() + times = content2.split() + + for k in [0,1,2]: + tmSheet.write(1+cnt*30,5+k,'OFccl-'+str(k)) + for i in range(0,25): + tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2]) + # avg + tmSheet.write(1+cnt*30, 4+4, 'avg-OFCCL') + for i in range(0,25): + tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ) + + if collectNcclResult and collectOfcclResult: + bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl') + bwSheet.write(1+cnt*30, 24, '(ofccl-nccl)/nccl') + tmSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl') + for i in range(0,25): + bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)) ) + bwSheet.write(2+i+cnt*30, 24, xlwt.Formula('(X'+str(2+i+cnt*30+1)+'-T'+str(2+i+cnt*30+1)+')/T'+str(2+i+cnt*30+1) )) + tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ) ) + + cnt = cnt+1 + +# 保存 excel +if collectNcclResult or collectOfcclResult: + table.save(resultXlsName) \ No newline at end of file diff --git a/test_scripts/nccl/run.sh b/test_scripts/nccl/run.sh deleted file mode 100755 index 8b92e0f..0000000 --- a/test_scripts/nccl/run.sh +++ /dev/null @@ -1,39 +0,0 @@ -export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib -export NCCL_PROTO=Simple -export NCCL_ALGO=Ring - -export DATE=221221 -export NCCL_ORDER=1 - -for MY_NUM_DEV in 2 4 8 -do - unset CUDA_VISIBLE_DEVICES - if [ $MY_NUM_DEV = 4 ]; then - export CUDA_VISIBLE_DEVICES=0,1,4,5 - fi - export RES_DIR=test_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards - if [ ! -d "$RES_DIR" ]; then - mkdir $RES_DIR - fi - - for n in 32 - do - for w in 2 - do - for m in 1 - do - for iter in 1 2 3 - do - export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" - ## Time - echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G - do - ## Test - /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH - done - done - done - done - done -done diff --git a/test_scripts/nccl/static_nccl.cpp b/test_scripts/nccl/static_nccl.cpp index 3c8b2b9..f12519a 100644 --- a/test_scripts/nccl/static_nccl.cpp +++ b/test_scripts/nccl/static_nccl.cpp @@ -5,7 +5,7 @@ int main(int argc,char* argv[]){ freopen(argv[1],"r",stdin); freopen(argv[2],"a",stdout); - cout << argv[1]<<" nccl : "<v&HYXGg@Lhg6%z4ke0 zPG(Y!!~MKJa@xr{d#$zCUVH7e*Zy&4&c5ASzdFa|($tl!eORNEdx6AMGhxq{_y(X_ ztJ2QKU$=IFHVx@Kfs^!V2|$(CNKPs&V0s+r#bcNa>$g7$o5o8gU;KHQZGeO&JoLMhN#Pcj$f zOLD#>=ToqV?Wv%$C+SF^DwgYnZqVp*i@%!Y%(sE_IbkjHD@bL~)%>#2KAG|wIbU9l zl;ecExqb?&@~%Zbvdg=@*qmAumzSv>O1a(&y10N3b%olNRD7tbeQ{SP9PeG+TeW2I zl8Ta8q@+x6kX}>UAsw{=y99zM4VL`!eWra|ZlxGRXf$2K}$j!2iJv_-8WU@5@kM@;f&^&Z{Yd z{@=)er@rHM3PJj+4EUdAu;*Qne~va+tL~A)R9yQ!@E4_#|0LuP|K3|9u^4H(Ze=-* z4mm@x5Aknjd=c|&hz~KNJ#hKu6cd(cy57+p3F|S#A2oDc)9YFq^!8vh*b#~u!Dvgv zs;)>l*y3;N3QAgf$_>p%S*flYozcivy(<{*Fgo>MG#ZIEt_^59s9mwTK(pcT1Umgu z-H7@_1`yt6KmK@|KqznX$3g);5;LMfe|KGr-cqW2o0sZMYu7a!OLaZa+w1Xo$|M(< zLg9|O<_f*JQg89-UN6eX%$Kp?RUYpapZayMvraaGpLX96yFq##k+$PUjiz$ zAgTmz)&re4>znhn0I9*m58CanDyj~ zzj^qZi}^|}<}!qp|8lhH0-rDVXy#PFbDccE9AN1qFBfJ!wMeLOqVbY#XA%F8nZX54 z3dKZk#AnON`!Qlo(}p)((qMKd zbhZY4PcbfjUg!*(nN8#Qx|+|c@kU!nxioEr=j#gp#v+M7>c9^!koW=~R}{ZWABjjh z&H3E&R{R~d;aL@dAF<&-VB zJSH7!&Y&(ez7pO(KCAJS@N629MUB&hzffT0pCZ<0I+9ejx{7UhID~n*ZFmgn=C#O% 
zXIBukQXAgB?x?ciscdys+wiI_iRQE6;XvlqXv3drVl{1p4G)JmuT3`m3=^wqoi;oi z*1URbcwV9kTCWX1)8y8)?KV91DRu3#;m=V>z&$p+eSZCb4gWqH|6UuO)tXT#4? zNWlFz{JA##0UQ4PHvFIsf1V9LbvtE&DGN+lV9Ekh7MQZYlm(_NFlB)$3rtyH$^ugs zn6iMTy)pCrzxn!JEAaK_|8hOHD|QYVIb*}VzQYBFgn7m)|8t$DjgDRXJiyEg1b-({ zMvwhwY;0@~<%+c8!e;9ZZXre!7_TxZ>DKuW%Ss!W|}rsMvv8)Y1&8`J?1gfw1F~u>_Rh58z-a3 z&N0)pVKREmWu|9}^q<~Q^`#9G)Za|g#t73^}OJ+Id#p82RR@mpWtOD7sz z>Id?tVXM!bTf90nLT>6eLal&`-4_EsRp3L>Re&T!P9b+~Gcv(Qf*wM|R-Bo_-f7@5Q z4)@UFm$&-*R*bI2tuOJiF~irt;u2&_KKl09Sh5|?{FD4Az~{ONdCl@E|FNwg(oHM! zHT&*(&4&)0_~B#7=SzHdKR~;$Kkwp2La;aK8T9qfnN14y?5h!I$_&@@IrJ zH7JJkncvf9{%Ei-S(@1Sf)KFNO@%x9#R3G|M3QDr_{Tt=` z8A`I)H6c{6KZE)X?Ho2srn_8+u5psE{lBPWBBSI!QF)h$l4z=1u;~g77q0{%|Qe<$7f0;m9MK9qct?7r;K%zFlR3>s)z$uVql2xXrlCgCWLLw0fi zD#@a9t)dPHJt@2E_cj+JOl`uc7=p&9R&RztGCL0N2WhO$$%O_gPuPK5RuQh=?m zJGi|`@|u|vSoM9?9>~v%bK~Iq?rqZ44^3n&^Z#KSRcs~eWl@~73uPm_q;VW zX6szAv_n>srs-%X$~9ZY$Nc2Rlw1ekBbv>c&FJEtFy}smm%YA3>jQR?Vq)dp=Np)F zEfx$vqO4Tkf5+;{m++G6`^YmA-XroBj+yWdi+iwrMwrFB-#5^Tj~CKK?T$$Fh|JPD z$mq8u`Y6yxA>{zz160_zDOx9}n~nGm9!0l>e}ltC07{E)dn1hzLtZgFBws)R!}Tqa zK9qcihC~`3NQ0r|e42{%m%>U^)Dd6emxTSMEd7A5&wIeFa?%XSmh*SSBRT?7 zgL58_sB8nN*&Y$uoCWSnS738$&S=^K!<;ku2Dtl9xKLM$0I~WTE8456$ai_`5#PbT<%3PJ{4-6EBnzxx#J2~02@xX(rRAv+V0i*4t+8kiZfGc$ z6Y-flJ`4Xi){j4OFda&|Mj^HDwgXyx0a&^HQl$u&cmob4!kDN;gPN>0kP|>1QHeTB z!$Byn(63+ul>92n;Q>7PLn>1a+mGWmd6UrLaHoCDM%1zgn7-HCDFLOQqlnd)Z-s)0 zVzxj~WadZ>8QUD3W`H!~Kuv2+3-!81kT8N?2S-A4h}IfP(g0eQc*56z6VNaI*bQSQ z|MC~Y9j;IOp+52Z(#b_6;NLZ<`cBLPT5$!A4}Jft-K7n&wrddQdfPhbWyfXUf1)_Ul}V4n|t z74J4Nr82XAOeEnK1Be0xfHFY*2(_Vp<^lrE&z4z1%*(t-qq7udEhz%bmO_PD28@{E zAeViTn4i3!wCFqRvwMMT_VMu;t#8HR$0)R9fJ4cDMdSJU zi;;x04kaHJNsP6y6}kduAbp36Q}wZ&N{ne(r_Rm$u5oWpa zRU$crVkr_qt7j;PCtgh6OayXJOD?Kl)jYL3#lW4~@f=;12F;=5qCce0J5vP0hF5t{ z7;}X+!MswcmZUHGBi^5MAAde&hN3cy?vJ5?*_Q*RnBAtc9QePF7KRN(xcZcnJ5}!l z0vH7`5o+Qmlxm+#m+LehN;6;Nhm6$OLi_6}t?3MMLW6OE?w^GMW~@qW8ENaT*HUe5 zM!0d&kqvm2BZG9<5vP+LKuK;5*-5*rNqH{Y~i6(e3_7ahuw(U>0>sB0SeeQGnd*`_=$JSC3`vf6`4vn z;z5YER5u%xD~{0$RI9zoVrE0w${J@$iUn0D9yR6>EYUECVvm@uca?Y8SPqP^$AWRZLHW1XO!%{jmc&;A0bsY3ow5KA)<&zJ6fE z8y-!oxvnnpTup0T;;ovNn#8GA-@xM6fNrk8^ld0pkQ{pnr5t=a*SNs*JnzTXCq^;# zt4;jAW^DGaeSL>qzGc6Pzlz!w+;|gB@@hU>qaWIA6-iA-4KJLRo}-yh&V)u&f2J%j zWq~OR{C{f!7k<;CE_H`NT41sDfnVHMzrm@-{dRoB4=#wA7;v4I-^palLsKFWLtg=) z$Ntmtp)$|FhaTgfh0hFp#AFqJqMBcGhXa8w_myr5NlTEL$Hwgcv2 z?z{tVHsHg6>i`b}ZUTG>@Y8^KF!WP^3jyb05?>401V}%yZv)%`cpYXIj{pV%p8~ue z@HN1LfHTlY&jMZnxCSfp4S-#My?`6BBk~a7KETHSUjuvrZ~>-XrvPsSoQDSfG~jZ; zR{%Ew7GaIC9dIe&UO)qI5b$$=M*)ujl2g0`=muQ#=Ga&@;C8@GfL{XK4)|Tby@1aG z4g!7{`>{s>8v)6o?gMlKz5`ebIPcG>H{i8^I{qJfO`NFfcpWT0UQP_#74>qz_oxy@RN@MmI8hauo3Vn!0!Mq$29X4 zp!#i$>*GzDtGCECcgFOBJ?LPB$J21F2mbjXK&^JV*t%bc+X{R(yny;sM=UH_U0A$k z=2=?{wrf|PcjaZ}3oimz^>iZ(*^%hi0_J@I)>V4s_=FJRq2(Ic!bd!J*m!QY%2|&1 zRPgfH*zMr4WNb0(u@Ur6&_8IWf7HraGH(K-t&K;H~{ zruKY+>1B5LAuInW(0h@8rk%dpqR&IzB>k7!={**GIq3g1fqol7C;dz8`GZ#eTRHy< zJKeD84}rcH`DvUm?UQqjU?4wv4D?~pm)PmuR{j@2e+Be(^&^{q2lamr^fv+XzJy?_ zZ0vvk?CJMT{Hi;y&0-K(71L!t=`8feg)-Ytf3f>FtWo?jjH*hCFe+~2#b~@^t^IJ>CY>Zn6Fb>w+ zd2&_>2C8ob=r4g@Yo|j7c|OrwKrg`fxYSM$Tk;LiF97``c6v^uDgR#3H-Mhb9&Y6M z4(PNCvB937>R^k7BjEW2crw}iB2;%Aonqx4|QRAFcXst5rvu2ki$vlZ|M8^uz@DY2I`g^kTaX>n-^-pLz`RbT)J& z&qmO{3Hla$e)7{!DPtku&JfXqT1KBpApRM?ilH z^egOiTUUD#$?yGB130?%cH>!xJ!9 z{_6$WH*@}XK3Y6+23SB5AKX27v1{&CH=v1*R-8Wr5RKKx`=B`xsb?(xu+Br`DoQNPDGpDLlOckggdLXqsEzoXu(9Rq{rx z%We-5V2bp^BAMF{?g>4UUz4gvmgl_g?FO@DVgT*EGUvP)*Fma7A9ZPg^ z&0V& zZfCfMp_DTDzv{h{N&;RGdZl|&Yg;^Q#NEsAcF@wrp14SPK2cU$Qd&{sxm4m+j?6F@ zy{lL?w2h744qBBKpR1|&(A(JI(|OvhY5DWbew4~TO;i0R6+d0>bJ~%(&xNZh|Fe^F 
z;qA(AQ}JhNs(+;7&(c&sO2xyu-FA$;&&6p9?zH$qZEu?P&DCaV`_tl!%yS#;rR3rb z)+%nL;<39~ZO6#_TfS2A7J{^bKDZ-nd^> zEzoAUiy42817FSfSq^*?<7YEo$-jm1#SZ-4jGyPg|10C&4m>@RCp{NB@PB3eA_u-0 z<0kQ6;lR_coe5v+z+cb!3I{&U_$mkfUdAtX;D5mQY6t#zjIVXz&&rkh_#F6h#@9RW zH!{A_f$w8{iv#~<#&2-o|AX-x9r(X7ev<=#F&dQY*6zTsV|=FrA7gx%1OE`?dmMQB z-6P329C&)(O!!_0{#@}KVwQHR1HXdt+Z}j%mO*lMIPm?9-{ru6i}AZ1_+K!7j{{E= zK$3s21AhtQA8_F78UK(2zlHI89r!OW{$U4xi1GUvujX^F15a`3%ZoH|iRV_hn~#Yi z;njRw&5O~E(#9nZ_+1!BX0MZS?0?h6v8%;Ibo_WU>mn+5{CL_VKiJRoMoKhZQ#?LLvaoaNc^8$A{jfF-(dVz6%w(W@y`K2 zi|_4VJto{F=lshgRy+sC?OlxDacL_4CB{D|zLcbCO3(T;rJV0EUdh=Fe35pZR_qv0 zKg0Z^ET6M!e`b6O*K0K^yB7l8kmDSeuRlwcyUrsSFJ%6wfIo+7TrCmkp5l6*`P&`! z`Y@V2oqgJYC;MzKm-!XXFEjA}MeuWBHt;^IcT`>*TBGkeVpP-4SSYddoiV!n8Th{n zJn1vLO7cA_fmU89@gtW>#O;h906ty0G><$d=RD0hKYxPdJSx6SV7+7g?3r9H`_%_o zew6V}`~M#JbmdlH3PSqlh%e3Hd#x(GK>lnX>?WyobQ1Mwg+8;dzZZCt?~Es>82>@;cOT(;wP8bn_&?0;axLTk zjq&$fAra#H7q~6KMuMt4vkI^hc#`ul2Oh=qFyajLNB08Bt=jP`%>Q%t6D5D%2hz*; zXTTd7@b_fEe+78b^Yh%kC0vODghImK!2W>lCN7=v&iM8?@T!i?D!_}tQyg%{=T}+I zi89H}-BnwI4Jj&jGy4O5r;x5+Gu|1`S7L*T_;c9NR6l);@lUY+O1F#WOa5YR$7NEy z)(^bmVUz&x0H5w0r8x*JRBk)VQFgnT@hvQv!>;xN#$U(vQubU5e;_%%%n!F0*Dl5% za`gAl15e|fbDZixoG-$9Z7|9L7<^s$TPluHZ?Sx5{8{Ce`aJ9yzwQP;oqfK}{OhlT zP+W><76wYH*Hi3QOC(vlA9&K!=?`CGe&_i9JK&W}79l_Z=26s-hPm7z6N`bTa%Zsq z=;q>bGk%2qB*ggJ7=I7PfenoRGvjx&V=5iI=(i;QLG~*S6WSi&6%UIL;E@dYpR$}y zWs+O@?FDcclE0D-reyA7{4o1lBNsBl_|LPRTNuCMVkzgFY#-%UcQd|~`PKM1%J@fE zpIVl4Cjvg{Q^4gazx@^PZsxIrzcRnU@)6d=wSmSpJ0sus08jFr{`MomU!+y@zI8tH zKh6Bk_`G0|l(U1|7v>e$F2>JdJE*w&8^$~1TNV6<^x4318^ww1Q;dI={Y14p#iMlR zvK4@V zbh2CwXFRn>^p37bo4-qM#}A!ixR7m$nqyBAr zFlv=z=3o0ORSPWPY;grYBuUU%X&h=KyYg) z7SscgFb*?|2XMlcAeD=sUiqORAEwc4pl^dLZZ<(xyKP+4^t#U@YLDh>Z(v!|8g_;O5Ns<3ih_ zU_32Xk3Sj=pO%YcO;CXfSpgg_c^XDmE$R<vcGp{T2GEA@IYDvKtFY@1w}UhfuBkf_A} z#KEX+I(5(`CPEn9C)r4~W#Y733iM_Ab>c_m;>6fVHMuf9AEBFFp*WghF=qV2ex5~42Y5F)RdwZ+Q?$zD6mQLoiO)=?E zU(7?hF@p}_&}LD{dmM{vU#u6i5;?R#uT^O)Oo zM;52_3sPjQ&D`&;H7e+oW$H=JLzAnkZKu#msyfKl8388|?_?uX=qWX)Lv1OZPVW5b z6j8D@=@9RD*c`cnI8+!jB^f+D<%%Yeh$H3Qku4bc{vn_9B5Cc$;v`H( z$BK(_VOdW+)`=L00UQooNs=cxr`%x+#drG}Hq_1`UxEuv;NKg~v1 zNEvt=(QC!g)wV+3+s=Tn%Aw&OI0`+*@Y1DeJjsf4K|vTrIyX#_?il%u^BRgk@2PGc zrJJq2vf&iV%)JYW<7@Ci6ekC)?OBhxl2&_t36>2d*rGVyq3eIw!Q?dZIqbDWbRAEr zzDn(;Ose;+5#8BV1F^}GjS#_;qGqGKI~1<#hBsq^S4RIGKJ#~3B9rH=Z8itkecWx@6M}%yc9L@=9%=0piub?^5ht8}NaJ>PEK=3&@6}_pdqqYQ;B6P=5pp@?7dhah?%#TEMc{WHMg@a%|=(OE@KdGA`^=}Lp_4kT+rWL zhh=Q3?uE&k)@JTS%@sO=23gSSh)Hr0mCZ#^y7h{=*vd*fdC*R3!R)2AHI}-x+LaCp zD_Cm~nx0O|MI7Y@wFAc5A{5OPJdMA zU6_CuTPWkaJ^+Ow1+hNGpP_kpYqt}%t~t^b^ER$-YK}DPsT}|hNrI7{V7ic!&H7vG zRcc}9tTY?r3#N8r9h_C?xB3yS6&B1=)L%O8Rf2U^hqO-v(k&h+_$)#0&(;)tNss%a4gjuu+wR78OhyPnixgj-B24EH`Hb}@5Cc5HYWm+xEl23My3eq z)_Qt!TUFTjrOjB(0%)d$2?1Nt*`S#fpPYg3?huPgHWobmjOr0IGdpX^ZX3mCjJecCr_aZV(3@7$4 zy)72AwfiKOj?$v!H&vlXOgv7gsKaxwaa#cxDZ4^#0X-0li>Fqw0;VmEJb-7#8Z<{X zmw{rni9#@RR5;gez>IyRK)aHExqLWKjlGY)RM=?g|A{^!W4{REkNICYw@>< zX9g(`rLwW&u`G?a*&5y&LN9I##^T*UtVvd_(#uOqvp2fP<&_n+>C1LfdKP=D8sD?r zI7`)2emIZZ`4%BGc+2FR(c1<`8aElE^|*Xmm1XVKpo%iH5o;^VeNhDOY%cLFE7O-S z)J~4Dba}wsvoxptSsFS*y|fmVektay;%hV3qTKwB30lPr&XKl{;R_Ggu!RzT?_(g^ zbx{GD9zirBN;iqGndlwe_#%%~LJrcNs`1;z+1wUyHic+`Hw367iZo}UOXhbFKRCly+i}q+GSk(nf@J-*6o+vie zqQ*8Gsx2PsYF`{`XH?C~y2XaSLlf+s{#d70(!MQ>A|y1TlCmWjjp6%YDG41k__Cie z@Lf-rp_R~sdaVSvB^?pIi3I~%3HI5M6cM{58WCT5DG7G+7ot1cAx))FOdz21UJB}n zflEl0-<6g!)IAFKcZVQe3d90JE5Y})yYb!Yly7`bDuJK(XfAwmIJbK5hJxxnG)g$} z+AsRP|8#s*{;FI_RnRSy=3n*uGlig^TiEk&;QR`z@9!)AJaxmc&XHfe??XYigWs9| zdeCuXLqzGX-mjw||17|Kzm8LXKk}c2kIFy7`YX7J(^N;YoyxC%pAiC%SX6%XJ|6|u 
z`+LA5E~ou(0hZp4qw=fwktpcneA5^zJ_SDkJiTj2rPX^&6x_@CNPp5(>Cb3w05@b) zm0!KzM8WN>uqt2WSM|T2^RHxq>OCk5s`n33{tw{e%>M=8s0bxrb0a~QTl~c@o>DJo z`#%J-J-^#6`4v?9D=9^Mqu^JNw&z#xUr|u;Wy-(bkzc*1MM3)+8ze)?Q}9toe)aoC z1us^Hrp&lhKAFO~GPtqpuipEj;Mt0iq0@d(aDLVPS~W8=%p>8pOSPXu{S-IEqVlWv z$SAm55jcah{O29{)%#`?+;t{rWZ*3SWk-JXUK$0RA=X*`UyvU^trmTQics$b7~V*! z@%C2bE0~MQ({H;tOqlN*ILi4IpAd#WDy`s5?{b$x`K4_i(;V5^i_2f3-xL ze;081r6xw+6mOCcKc`B$NLRY-&{2e-`!)%nZQ>vB(eL$ie9U@FvUsnV(w$KP?2F6P g57ZC4n5fECy!iRLS$Bma82F-0(U~>&%aPRn8}@h(vH$=8 diff --git a/test_scripts/nccl/static_nccl.sh b/test_scripts/nccl/static_nccl.sh deleted file mode 100755 index 9cf7ad2..0000000 --- a/test_scripts/nccl/static_nccl.sh +++ /dev/null @@ -1,29 +0,0 @@ -g++ static_nccl.cpp -o static_nccl.out -g++ static_time.cpp -o static_time.out - -export DATE=221221 -export NCCL_ORDER=1 - -for cards in 2 4 8 -do - export RES_DIR="test_result_${DATE}_${NCCL_ORDER}_"$cards"cards" - export OUTPUT_BW_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards.txt" - export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards_time.txt" - echo $(date +%F%n%T)>>$OUTPUT_BW_PATH - echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 32 - do - for w in 2 - do - for m in 1 - do - for iter in 1 2 3 - do - export INPUT_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" - ./static_nccl.out $INPUT_PATH $OUTPUT_BW_PATH $cards - ./static_time.out $INPUT_PATH $OUTPUT_TIME_PATH $cards - done - done - done - done -done \ No newline at end of file diff --git a/test_scripts/nccl/static_time.cpp b/test_scripts/nccl/static_time.cpp index 444446b..4a29f77 100644 --- a/test_scripts/nccl/static_time.cpp +++ b/test_scripts/nccl/static_time.cpp @@ -5,7 +5,7 @@ int main(int argc,char* argv[]){ freopen(argv[1],"r",stdin); freopen(argv[2],"a",stdout); - cout << argv[1]<<" nccl : "<SKDjvwLY{|gP`HQ*V@?URqBtHw*HuiwlUQfTQv8(_Fns( zGbb~_rm_FL9GI-L*IIk+wbx#I?H_06?AyIH%Q9RpOQ0nd_XO+lq0LCTlzI+8Cb3Tm9D zeAz^D(;r-J@z;b~&y;Be?fFD`70mB|x8+K_g7$o5o53}bKG>cNeO&Khp%iGWCz*@# zB{*M#^C_6m_Eb>WlXRp{Im>lIH)wRZ#a~Tx=3CAAoUn@d6{IrgYIxmfnMiqcobSB~ zAx3k;ZCpPERe7I9KC;Wlz1W;uJ(riR9g4W#3X-4Dbwzuqxvb=h_Lj@rL*cIO%e%|V zE-x!7j718I1)cPwGN}2MuV~Vsc0b`*oM?+#oF@AF@1B!)<2Q1Dbl~Mxn;L!heQ@(j z4|^WHT2+>!?Wn>6xYOr!rbY4|TrgMTg!{@yh8CBJjyPpRcJ04Yr12Mljk~ zyR!1vb=L(n9n|(%b)doUcmi$y zsBT34Ap;0+gCBpq^&l*0_Qyg2JrXmbL4QYequyAgdm9$&_1CRvFc#{1pu5}S@f1rg zFonXc)eR+jL#f{A(Y;=jk)AJY!Am{f^+LOnV7R3{m4f5wRt95Du+Wvz+>yE5UNbhr ziS=F(2!*Af@=zqELq@QqIvg^pWW`oAMA~ECx@GkZkp`V?1Vh7?vXz?sExNzGJraNk zU|zr8j5ZJJ0lyJw18YlrowUuu)}Vo*B$&!PW7-c+<7mJ1&63o7dTV#L-WiO>B4K}f z$k?c_FLGEYx$VcZP5pJ!zQXIqI$f$Z4K%{rN_FY2;SC+Bj4HiFm?|0+Hk_D-X5(0X z{WrBf=Wj4d0+B9*8xIa_HeG`e{dm1?qOFAx(i#ZpF^V7_O>YWs2%-4;V63YnNbx10 zLJOiw;1)g5c8k8wA8OYScgSCb3H?T}J7f@wDzPpajC2MSZC!|*;0uU2I!WW;)Zs_w zfB0cHS8A9=YjBR2dT8$Myft2Gvphi&+?ZT!b<_;YOdeAb`ZqQHh% z<1y(-a|U&(@s;rQ@mY1*R-8 zWr6>{E%3*@bN=G%c{|6~oBhj`*sj>xZ)A)P`g#uL91w;XEq!T)rj3kV_%cA=xf=1e z5oKiPH>0DY+eMl-U`B?XGt;#3GBUK^Ow)$T$j}eXG;Oqu41LE;)8@{|(A{R5He5!A z?l9A|(K0f0tC^+^mXRU9nWl}Ek)h9;Y1&X38LBkXw2?A04R`;n?IZIGb;W|}reP=7N`8zQK`nWl{p)Za|g1_yS}kv`jzp6y7V>PTlh(jV3& z>-)YV{f;C3x+DFnBmG~F^v@jWryc2^IMP3Iq`&V-|Fb>qd8a!5+|9oDZ+$(l4c9f+ z^ktV}>&e%5>XT?FU*F>WXs*%O_aMuA+bM|~Utjj)1Qs1LW}|C`=^kC{q|qaJ=WYhO zc7P%H?-u-}cM-VcBVT;jxBvIo`1T*m^tqn#J@=6@3j(|>AZPT*I??VbKlQ)Oi{F8J zXkC{z`Fa+QT!&j<{B>i7uXpi9$d-8Q!_m=13!M3x?5Dx!x(RvB@+tq&1`z3{3Hcg) zcf9RG2af+_AM*L)KYAFT#n+p4;XEPOoAC7edS}lfg?hIV(bvexy4#Rn40 z;kn>B0#z)Ye95zzdFBe9BmwbfV2z{5lz4%}_LY8$c?X0hPr(jnlSuHzUrhXhkfsI& zke>IbmiI(|PogNk^%WsttD6deLT||AT25XplSc@E`U}J-U*A@ZsRv}Tf|LI$nPrkR 
z43SZd_;0&<+ZtS9>$Tio3Z#k6o#mWBvtJ<`Gy|BPxE~R%X-T4C0 zh*cR#6haT*q62yN^l#}m&{UFR(Bu%x(zjr!Mf(=N3)zW%P)Qb5Y!&sU(37&ej@}F1 zW=Y-pDZ_(i2A7rLQ9|}+|G}qZe~nmoo2)M|0dCYwNYyL@iQ|6}NsSx-KJ)}r-+{yq znOw`%KS*ZTKm&<>B=?d#_456d7~b>an~ul3j`eIh?&>POW79G6+OA7r z^GB(96GE5SA1tT3mfi;xZ1*yyAb1Xl+yB!P_5@{pk_P<5E;x#42D7P(UK}eI%S1q0qNbHWW%ruRj6(Ak?;1 zNQjntqX<$5T8&l5ZcMv2dteHs**@Hlr`faP(R=GmQ(W?tFFuF?AX{_{qx6+}Zo>i*wm z`&@$#e?w6=jH;bbU?6cPLQu-m2jEKFyaN?9>6A@9P$8a>IeZ#PyHC1J$BH*4d_9|n zHRIFLYDo6hDvQlYQ+ATJ$+Aq-iTF7~3b6U7gWH=RugMc(7M_5TExU7Or``x&)_8x6 z#(N0qLsRsjbHqilK9VQ;O6@T#)MY2u{AqO5iWby&9kLQMxkW=!u30ibW+yI6%C#3h zqS>t305NSa=N`nD-M)C!PP<4k9r5n*_02vB)9NQEE7kY^&>$Z7lInZNGveMu@)nL6 z_YR7Cu>F%Ti}zt)UlTrFNEfv`B+-XumZpA2zc0~8fIb2#djWS+VgF9yI!@i}uy6kn zbX)j0I9vpvwCJ`s(wH&e72`wV6(lew-YV$>iH~UPr160?7)YFr1(C0}2v(w^4*BB0 zB3e-W-o0*>^I&q$-w}`K2vDwTkaHTG^Jqk6>r2k|kjUmNa8IfN85vyEd~-Zg#hwIY6VnIm?xL8qOF?HUiJCD%bE`P_WvatY>MUEGz*a| zuzo(@?)Ak*jOdq^r$&I~381vP{O!1*u~<%D^R|2q{xQ^xKXNo3Nc`UsNbT9QSL>P! zR&Kv!DZ(ZG2!|43OjM#)&4_Br380RsM75=1KNMH!H)w2*{ZY=b(aec2UHxV z;V^m~90|=KT5BLd188;pXzAVH;kG1)1L`=Y(HIPb>FHI+BnO10FrW(M(mdt4=*d|*2So%*5E-m#GYq=CfqqC*}~ z8vpYLuBj&jr=(aRp$~)o&0)~Fb{mNoU6GTY7fJXBGhyZE7iV{D%S}-lxDuj4;jg` zg+kaQN#7J2j0tq#77EzsEVf!oTbI3)Y-=;ZjfswIz-&hb>9B{LPP!MNkFHGg`h@fu zEWN05aTMH#UeHJVO7w&o6x3-bPOkD*D9OHhNu^q{6V&Yl22A4t&57V#QYI6_J4ueOfyFCvMKfr(Ole+@Ri&?z1uWY0l2 zqIT+-&0&B7w$02%b`_rSj=E$oC%+<72}j%y(U$6FSL2Ffv;x&?Z?c%#5Vo?$SdwBv zIf_S(c?65s_M_NCX6s$$9W<^6M%d(8p4o6{d{QK73fC{gf-nB_#Go*O_plqCGT}b@ zxTr+W!E#%q7+b0;u@7;^8bXdh6dH1oL=7a|!i{L)fmsF;6{7fG(d`_fh>Qb?7o-9x z`(ffddKHl_W!*!e{Y-nM#&7LJim}7^U4VljWbn>o&*`#7UiEbjARw#J@axg=*H9xl_^&N8w znNK;!wl#Pxq;r51p8XwCQ-%Czh@s*fnw|+=d^=pkySzz*@5*JzS{t- zE&4l^5IO&csf5s}os!rW+tKeV*>qvk+ezL$dl!Nsgts0-7?Kr(Os-fb!YNnm0;(m`+5$ze2ZS~dJFZ+x$!2N-c{aQsUKLkt}yT1 zEmWK+kD6S#Bt3UBpEL>b)SoE}Oj%&c0#g?F2U);{-?XSp-C?>Ru-L@GFK(>g;8fs# zGd|)67sN~qxK7CLWU}R<$sLKIuK>_v|LOQpnJ3{xkMU2zX9hlE+KoR^&9Aw`fk3|JUtYS1)~OJx#nbEbtm+%!~OhU zkB;uc{X<7aM_&Wn2Y4KC{%=P|?+2XyyV242pyZoxjE;6fk&-ttVF0WJoCSOUumo^D zU?boIfCk_*fI9%+0DJ^68{_d=sAvh`)qraO*8pw?ybbUdfV%1=a*8raa z%z|;=0K5S37+@7(0j6~=fY$D(pz-!1?kphb{vAJYXH*y?~v7j{|N8{5jymfS<-QwL!o}z+u3<0rTM} z_X8FI4g=N!&cGJYBY?L79tTvvg>ikMUUPNlyXMT8p0gbtitu>)t!;*XUID08O&1Rx zF2HRGK5eg{{?zeu^OxloT$^{whMdjXHD_LV>4FQ+2Ug@?1K9s6^dJOZB-g-aFYuj| zpD!wF8$O=bMn`FfUi0SWe<@?>sncqq5RycH2%n2V7r*OK^lL5pPeDHubfK@Jue0cH zgMKOK;`d;RzRIG{fIj2V=bwNM@tW%kbv440)4y4InWoKA_9VN}=M31Bp7&?v!AX=3 zzMjm?{h2}$s^<=r^BeU0boF=y^kL8;-;&X6)#G{4kAt47o^E6zI}-h~fLY&xb(J0& zJ|Tp7XrW2A@Da})HlEw9a;`>vS^=Kh!DGo-Z`NZC=sy7cVmtk2EB|evKMFe8#ms-B zMc)PbmnM``MMc)VdL!hUsC)x28reg_V?T6w}Zago}cPqi-m{5a}Ye~Y<>*%XF*TbHw$3q z{{sDLdl@mS4&|VafPRskev?IC4LZ#g()oTj=+iJaNawpdK|c%hbZxK~bRX!Y_VU}U z@(+W4Gw5^e^cyVtG0^>>r?N4%VFBh24WOrsE#;usgC4TW&sZx8p!_sP*)@)Qn!9`j z^kw$^o1}hj;C6!k2U1bR9f(fsHX z%)ip*r+L%KpcmM6SZT?p`BVnzsch&*o;9Gqhw|6k^OK*pNf{RaPj*}e8IR#Us{w&X z)r0sRw9488SucX`M!T$xdci=pI0X7k>_E@A(`|7f0s1+hQ<-KNwy`D$!}0l`&$iQR ztn%l9{%O#Cb~<$Y0$8YgAL!SBo-UTRfxgB`NB#)szZ3M0pr^7W$=?HdJLpOM0r|hk z`454<6ZCZYCqR$9G&+j!#ai{xs1g-)1C@h0=PuC2Z}(Lj$dE)iG>Uce!BdAaf`nL> zoB!2}>fC~_WO{SmeOcbzdE2t9a*Mt+&6iu=Gktk(ML4&-GPkHQcixg*_mbR#CAs-a za&yG{5)8=ch8%<-^TJo_PQUK+cy{N?{_b?`2N}<1zB)~NVmeqr5g*)r^Fr60t8UP= zpSng|uV%T9X3oP8wX@HAE?fIuHaIcu5f^ggEWSoFUQx=%Az?221EHGt(DGN+lV9Ekh7MQZYlm(_N@V8n(Y%t)R$XJ@vrQWlr z)}~HKd#7|MJiP;ut{D<&np@tS%IWSa<&9XU<5s-~a1PVevlje9OxJcI+pC>n#qiK#3U}UH-tN}@4z?$oJ~6xmPqXE zh-)6-&~qTVxHwI{OKop%GQONEuJjhqoWO$Z3~{OQ#U2(giVt6T7xjZ(1qxqL7U`XA zSUk!VX{9F~AByy~OvmFfk>=*$(k2h6_o+K27P%#+f?*xQwG2BMZf3Zh;ckWxGwf$L 
z$nXfmVTS5uhWY1d;&L-AVpzejj^SE{oeVcK+|E!+nfPDz-bp0^F9^NTJ+G;`D{OSR z7vb%oMVEWJM9TBU;-bQ$l0wfV60dTkhneVI1*)NKZ1i@}@|5^YO}&TS#txs((r!!1 zpKbP|Wd3QI>OaZ&>0;m0j-&fbxT^9$J1rC5uKYF`f3l|fM>75tP4%N>Je=EY$H@Ck zoTlJTiOm@7d?Y=L7%-*iJ|i7qUR2-6n5wU zp5&3U(fhij92n+Wqg4H ze>daLa^U}i@oop6p3#$@7dY@AFn*o`Ux0Cw_~$$DS2DiHfxn*dB@TQS@oOCTzc7BS z1AieJlKd=2AwIq>TlzuSTT4&(24 z;0GALhw*AY_YUwBmtMYD6PI|7g}bvcfh4?|Z>xDRx>3rwEf-a>tIR^+NvGapXoJ=kzh-PCMh(JYMN@yO1+>-1!>t=cB&gzCv;qbA}Ir zPjwE(X)qx14~mDtn%2tv2IFsCAQ4wH{srJ?^1U6b#DtsVG+rXH;<-3(zsmT>=OyD` zWBd&9B_(|4f`!$bEakj?X)^yd;PbUJwF1X@`cKS1kIQ9o+MgJI7uRbU%ij&f+>qlO zm#@cyh4kN2BpJ_R{+|MW8r8T$BG5g>^)mB+!%?r#pvhC&rv-SD|0q{f@%%Ck|DOdv z7iI&W!g@#LwV^fo?juGu?W7ANmcBzqmp={vzW`7ATr0kWqG^vwc5Okf#OJd8Z)bcT z@TtnBdE{vsXKK#*`O_@t4<%B7@`qFNxLo$Di&=h@@lN}{34E$@OE3i?{maXwfOA;R zmw->jKbVI9{L`eIor|P^8kVyI`14RN_5OIcy|^9*K9zhzl0FB@C7+M^S26w>_D_XB zgo@A17(4I1pZPz}e#PBbdmMQDA9J0IFFR7X53t`3Ge;Zao&9kS<8zt6i}}yOf}7<0 zfZOFF#zz_dDz~H3VL#)yFrK@aRykY78TB50Ww%D)N&c$&QZP50_Cv-y z2EPM%lJATs#~FW9u`Ks}Pp zxW6c#`{D4^AKh~$w`#}lG5;y-Z%Y1IXQY<{!EL%M#=cxOCc zf(%=m3wFJ;e#@CTCf1LlX@i|bCtf8glv-vXY-JLfpnia4K-^;&K=ADe6#{N^s zg&b!5^Q`B3#xK55%6X03Mfug;jPGTBH9n3o{U8TNDKx32>4W*$5E zfcgKG|&vVhn2AH{aXA7n%QNE|=X)dy(;X z0@9^)z6J~=-{}wc0p>I?+~!408UP!B}HILxdofD^U^X@U4L zmLD2IPOC(gg<8XrD9))Oj&fZOMW`GkS1iK0df_fQwyvhw1MP$B{9Wyaj-PYgqIX8x zLxGL*JhmE75op1vpm0eZR7E!;c2tqSCDJLglXM(vR*Hl8g56R}f7poCl!gL0{*9!j zB$tIFv99J=T6#I;hdOZRR$7jQfk=Bhj_pdz;3@Jht*vYbL_?iMTBcHQ&|O+`d9d5S zS!ThOv>Xfk8*WKUENP7fJJV82gX5At#X7ak#&Dz~z;9NF-($_)K?LYB5J6pa~rB-}m@?*ehm zB?O+JZ+Mu`*w{Hv4=R>i!C1gQ9vc_hjMMd^!FB2F$AvaW!FWQhPJc8QJ|P#$8m9s! zvI00<@&t^mTGStI9Y-8<#X83=wA3sVy+7zrAACGz+zMiJ!TE#ZG^{F`jxc1aAi}o< z4R|aD3cUei%6c4A*h0q>8bLiILRJZaJr6B36d;a-zJP%SC#?E$L}Ckm@=HSsrHeu# zOAw{H(1~C|t;r!Kg6DbshKwT8$_2R79M!f+|Qm)tI zI9MJ6i}flpcVr_?D5`6ki@aWp%AyG(8z)w#*SlU6Br5TbI2e^prw+QrL0FoO=^&}LD{#QKy` zCf`3$&AqD(y~~Rl;Yi(u=vUP%5!`TwtKNly_7k?FdCYC9Ba4&z1u1f!&D@`?HA?7| zW$H=JLz62iY^TslsyfKl8388{?_?vC=t(uFLv1OZPVD^ZBvG<8=@9R(usL!Cai}n6 zN-}tQ7ATrTB91KRh^)uR_jmc67lB)a@TGdg5)R+W5hq|OI#yha3yV6tVr__V7{KAc zr6hTrbIKj&&|6{=y$z!_oqUdb)j8D6?HjVF`hTu=~3kq)4}94@;U5PCc2KNNMEXUQzq1Vt`*(cRs*rg zF&QC(Cq)g$f{swQx&z*f30^V%clgZTWQla1b6tZu!2UgP4HJ%9oT?tgUQ#=jY?u%P zgtil$8}UeMpHO@PUWhp1^g|lAC&wb?9sX`TM!Q!uHA3*fGR)u75)H;;k#!StIWM{k zEo6kaVysuj4vSd1Ojek}?24=-7c!N!mZ`@ehP8~B)=;4^cPn!V1@D;*@As5q8H>4` zxfuH-RtRF|EhbA??P1OBtV@H@9;;3pgzL$~V$V>IU^N%?cT{5;Tcmqovij@N_o9Xp z9YKRE=yk*-xrmy~MNq2sin-XDly>r4H&FcE}&7IbNyxx^$p0a3A%vCY& z1+mQ)2*QiJC1NvfB@8LTC%7hWN5oNIF@TM&t?+%Q=Lxj=qdM=x1iaWn8RPWnRf0ueW9?G4RVN<2Oy1TC#;{toYY_Ab3r=pkC%!!itWH@kK7z8Vq_Cu)!Z~sUB~z887-^8j(5xA?$Dj;^<>m{qGLp zSgO}zr_*DyM422Cmlf=HASh3+bZgq1_ zv&Spyf~oNs@0?I&TGXr<4=%eMYaFq;N&$|Z6-;mqpFCch3;4;foLZVpPT-SUvtlY5 z4T{!C-m^phFuVJNERBemuCrryFQ*&Q%#&dp52m>Ey@)gu!}0w~Z;r)m?LNV!qqHdb zO?fC16OR*0s`1=w%vJzK%JxumKo7*a#8WF+0n?T`9>6EX8Z<{Xmw{rni9#@RlsMOJ zW5$qA;t3S2k1-3Tz;`=DJfBdM!y`LHI8l$kGhTAA{OGdNOUrT`2yHZ$b;H&E!FvSzwVjXLq$xF z)zV9@|C1hMPTDA7KDQ9l)Ja_FSbHz3)7hqw9ia&t#dI?^kJvl%u4vFxJ?xS@0-n;A z&^lh;>0)UIg}i!$PfSxMd5=|&Ox2Av&%#%+FkuQtu@<23v^Dyh#WRDXhf@I*U8*lAESkL0MJ}(bs7+tClhP-#x2o}dk{f5Cddd&ykvrca zga&V&m@|5t;Yed9W3(QZPpc+bdo`${%*lwgCFZ^;g7;)D@hvOUmoU^$jSwT3Gs}n74|r%~*?a^E)PJ6*D+Tnm>;(JYd5XO8l*lfoRu71!#H%QHLm9 zFTQ4?w|3x*JW>fcNPEi1ZWB-DCb4~o7>1_JvIdk)VJJ-yYB0jVPsZ!vO#hH!dy;#X zwY`}8#4e0R#Ft(QgKhkU=(ZL}Qz;Y^2n-NrKJpY zkHY;OA&8d(v4GGD@jdMheD^x(8{ZR3(0gcH_+)Ty_1+By)q7}^aN@OJ^nL&7_^ABl znUboYTPDrFIp&?bp|8=~^RMRo3aan#t87{7hGDfMzk1(?f^G-DGynCVo9m#eozxsVf2smO<`PKV;6jbl;0gJet_P-Ta zdN+>Buii(ZppWxSW2pEP{37u5t{s(D?=4YqH|Hb$Nl&FeqqRQVkWE#7^?nltH?zX3 
ze3f6-|LdH82@6#3K~Ye>e~9v*fsZr)cYvcJlzh#N1YK_N7r%H)zMSp93uJqKw_EZn zsPtD-@XIH0Dfm64?D^IER}@ryoDIL=ke7lFJMyddv?yplbAxiKdIK81S{H+KEidtVg9&ytfar~RJh{Hp!63T9_G!bt^G`>FKLa6{#*{O?srih|n| zL88s~hHz4Q88`O)>U}c`-g&a(l+aoJ>yG^Dy)+6sL#(s@e@1@%v|98HDnY#$U~mnk ztoD*A%YT&~nNXg7+r?qReBZzk&ae2=<g$%4hpg|D{XGQ-1IuaAZG~ zU%mh9*v-TZej?f~m0!UE(Czun-$$;MnN&YlB2+$=J_nfzr^>JBb6IHp2_cYw3${+Zy=Toqd?Wv%$C+SF^N|x({ZqVp*i@%29%(ss7IbjX+D@bL~)$~%lZ6f70aK5~1 zDaQ$K;hx?Fy_Jg= zEGjRFMM}y92kAv+Q1h)?(_%pFLBg>((H65fP4rJbGHaXn(~Iudd(A*i@y)%Po|^Tg zl}ZMcNj!8>el~%XJ|B0)BkTYDtXo{O(DMxALVV`oQ#|*!^M->@cE#?Rp4HysFPyz+ z#tNX*UJD?6W*W+0Q9wHUtPK3$&VWBFgB+?|I{9@O`0HVabo_}7{GZ9d|G!}5baG-D z^tm`gy^dwTUyz~Pmow;jR|b7HXTbk1gZy7*(ElSD_%F$Te<}n1t_<}hzjNc`yc#p; z|J@9D>N{?y5Tuu7!2cqHJ#UBnvy3@LRkswT;@Z8yUzkSzP&;F_fu^{}6X@_q z&3M!wiUZ+o^5c)U5rl=U{#YnrMq=@3(BDTLfcGu+X*8+>yCbuNfQR z)Os%rgu+r#WhfFeAtP8`8xF;5WX0ArMLJ{NhLw#?ktUOD1Vh7?vXxr>ZKl7oGZKIa zU|zr3iZ&0M0e?Ku0oJz825Fnc?ZG&Pl3+UXjA=hOjidcCH%m(MneDy3W_K_ei-i51 zq4+j)OR2*`sck==Z5pqZ_7z??*6GrM0u z>%Xb>Ie$~UJP_%LbK}8*t)^>=N8ej-n`mnxgtP|&W{e_;$1q#MTSF+mF&OLV3Q~Lt zsL+C_64-18IyRe|{Gm<*afkd>n9v^&_J-nwqDpLv1|!`;McWi2C%6$2$0TVSoVxtT zylTY?b74uTv8t|i`HFSxa9>_hw#dM3_3~QNQ?gK}==UXM9+@=DO3JOfvXTnK&}__l za>d^q{LRLEB^z@Y!peVH#&m(t6?`;v%Hz3C4qz6r^pTefGoBhG)Hu;_@wPLF{}E)kc)qUY^J=^?=2I@iIKuOFg@3(R;*UD;gYzUlkH;0ouhNGj zl1_6zx4ad9hc!H_BJf8v{5cx`Q4P;a96?QLc(ukN!U+vOSCey6!+%J_7qb4;7DXCf zjmM-T%^B3C##h4YH1GTUW7$ zXIBukQVp-KJ1R9im94HS4X@ggXg&=O2ePgP4S%|YHH>u{9u99^8#Vk43u_o18XgX7 zUELasAeamO=vV*6{lLdXI+xfX2U9!_&G*UHdfrY=s1T zK*OJ{;rDC!4{G>94S%kNpSqp0z?221EHGt(DGN+lV9Ekh7MQZYlm(_NFlB)$3rtzS zFkUbC&|AK~SMq%Qx&OHq+ZDS8<5{D_zP>|w2Zeb?D}KGkFh)i%dKRGIe8JyAl#ye< z8yy|pP5C=$17_sdQ&yTbUPg``u+p^QGIH$uR+=_iMvi^OO4H`f$g$5^Y1(iZId+Sc zrj3@7V>ei7+F%(u=C{(cu`+V(qgI+WR7Q?fTWQ)z89C;$(zJmxa_j;tO&cd8$Ii0S zv|%!G%w?quMEbRNRefoL1ogMlv@wGETWQ)5LH(^XZG@oyR+=_IP=6~;8y~2@m8K03 z)L*4x{kg*WHL|^sZmFl%u+#O9wAYco%8{;cq!&2S7dg^%9qHMQ^qG!yt|R?UU8=rs zI?}H?(l0sEzj360=}7;~k$%jPe%O)zS4aB0j`Y{{wCB~@#8cP%5-<4rUOdszTsM$A z4O^YQfirK%4wP@;ssm`Q(b;z(%Nx5Xi5uTQ?vDs8IT@dgt`(+xbgk1yj~1N21MJ2@ zhT#94;IH^BflJ@@B~JJb{OKdU11Gb6uAlgxdN)1`0=z6BZ}jLU(e5fg^}iigy$bg* zdM`T868cw!I^)O`xy9K8<5v3pYk8u3L@RKAYYU3 zmREe}z=?++LOx&OUmpNy^Y!OkR4fF0lb%6e|Lj?$Q2#C>`uYYDK2YTf6FbSi#87e- zJQqAip^D8@D0!}8o_T^NML^;wu*Pv@NwJg`ev<{7FFeJKSc+4=sR^8ErO z+3bc8D%hVweTQ}pTP4$7wnNt*lCa~iR5FoKa-XQYOGHUDRTP`9&~U+O@D))>32Lrz zt<_xfgu$NXQu=q%oiBh2u;xR_$I0$X4i?-oxN|U$mX#dC7Kc#w31SkC@+f2{2cVKH zD%&n z1t-i@3KKWIo#;8)chlRhp0ZnRI!PU{=TbE9acZ`?q8hVHR#9Cmj*w6E=N3{5f*VlA z?#C@*Kc=jY(A*_?4+5Z8O5&!IzC_OnU%&UnSnTexC}Q;WBfxo2QjlBnXlxpxj^_|Y zV~6%Wl1+Tj=Ux;IeZ(A!LNC=$r#GI0euyS0c6~%vsWyrrZJ^Y9*Jd9qp)`94_v2~y zXPN0}XDec0=Cr3+3HV@(!WV0bf7z*mar- zhfS%6YVUqkE(UpJl?__k+dhX1q9GvlzzexP*RbY4EK9?v+ObF7ipY|-^dYzsH}6nY zo^~&u^*LM`N!=&imSZJalD@v1P8jhIN~ke*j zlDwusL{E4EO1ACJnVotgcG)wRlQeUIkO4Hs06IrP6zd~-qOa7Sv?E<^@{Tu0N43rc zOFLvGX_}6PqFl3Ne9TR*Psz0(K4NIrY(f|BfI0UeyzKQQTK4E7#l*_H&o?l;84HFV zQ&y_)PqBLPCA_5iKJtu&_prQ$VU^)L~!ZKM8xkEPcPP&%58Pa?%V+%lUia5gh@_^$c^)IOjYb zQP~DkvwcToa~8NSU4c!hIiqO{40F!p>)`G?;X++00>BnS$(cd`c_Os}Dksd7Pgv1b zEoiUCLf_>rhkXa$$_1NZ`7%wABnzxx$hZ4_2@xX(rRAv+V0i*4t)Xx?ZfGc$6Y+wb z{|Ns$){j4OFda&|Mj*BCru|0GJg{>6rAiSl@j4tzgfUTxdNoXhJ#RC zq5pshQ1YKq4iDhThp9|CY(I+Ih_M%2;+Oy4W+lz`IDQN-@c+o2$$m=*{M ztsJQ#W1EB13XoPDsBWolre0SJ2}jWD;7Di=(ON@E8bE6kkNNsH0R8+UZWuH9=RXtf zP+j6rb%{SFA3lNU%3RWf_!7?(?l?H=5=ruaCto9zA1c1CdVTc+qMS$6ouP{r15mOxG(kF?z z$&Zm1eTRIy7szHG8;{ZYb}W8~LQ9%hJPiVEhalS(!FFhZ9?}?aDETjFJYRnil5p0c zyKDJYdF%9cywY3%D#zaRp z;4()B>99wfPP!kVkFIR=`lR%kp=1qJE`fpv&~2kNkehfeVgLiQbUBWkCQ*&GHaU~6VB)m8Y3chn_&Ir$ZtN;u*Hh_+R? 
z8k8%J(F#<%y~$!$Luh4D2E%KL>uoeQAkOYV^Ld;fZ!iE;M+U!YEgt|01?o_iO%lRL}jzVfoEVM+YM$dzg zgBhaM{M?Q-yjn}hLKLM%c_EA3qaW)8PLl=O-;Cgif85#FpKigU*sI7q+~e%PIml!&$GNAUzZrc)UPJ-$Li5p zzxDMUborM2rsrkUF7LVxG|8*JzS=yv$u5$bj2d1zFFi-Io}3Adrv6M>V9Ekh7Wn_x z0xtZfMP2F+gS5b6>jS^Iv44Y8h5H@&h#y=KGcn*gCBKtN%R^Hl5<_1BpvV5x@u4zL z!-pQ@pMlQ|e8glGf1-xpa7Q);0-f%qZbRP0gR!{V-`U=}b*XWgb?L;`o){6g1OxF% zbm>yF-o(%A=tp~IC>#q$}B+u^9Gbv^xM(VLEP^-idhHXK0y3%$tZeZbaWrm zuHTQ2o<#ayj7)JTcO39ez#I(0MZlj6SP8fQkbY0U91oJhXH9HuK_ciBw#OK9t?I5U@_qRGy?*B60m~|iTU%bfQ5kj z09OMZ23!xA1pF*u9*lh$uo$oi6L}xt8o&;~>j7^CY{cy10YE?CFyQ9_PXIm)Scrxi z1}p_!iPiafzz)Fefa|a;vKR0^z(K$y;8DQyG5w;(?FDoLeipC_@CCp&z!_L)>;zm0 zcpqRl;32@f0AB<=1egQIcnfeo;L0~fM{5AL1GWKv8E_}y_W|z%90oiDxDTUBjdWU z(Qx$^y5`K7p0^tvjqrFXZgjv$KTMo8)5RA4eB74f)9@U8hdN|_;mZ7?)dgp4&D&vo z1!?)eYH)00_dNCF7#FTH`(-8Kz|N&@jEd^zs9D|fIj2VuQ&yr)NbV|tk` zKV;{B8}x4EFVN{LZTfkLouvN~o!)KJuLS+Zar9dcI_Y1c=MUQXw{!lhbb8#T-vjy{ zE!?EKGx{ygaE>PI&JA?p7m=x+e#d<_9t+1Lwz{6m&i z&V2am7cq9ElS6vY?`IRBXR>n}=y!vDnXaeb)^jK5Uk5#1I~OC*da?uPJ-E-Ar|DTc zfNs;5n-#ES4MWxo;JrXEYn`OKfja^Ei=dy->8Nkk3$~0|7`q<8SXig?WUUYkRNr#Y zp98%{r$YvLKGB;&H!wyn*6CqeejM}_vbz62{PS(eWi^_NiJl_J3 z_?@)syDfGdp96g#=$UNvHs}wJlm9$~{f9s=(sfvC%fAxzAA+9FhHm6p5BfJi-=gOy zKkbk*<^xZ5JPDo$ai7zKK&0wHeBZXq+6P$)%onfIWo0!A2C~Ir&<}xrg-+MvLK5_0 z(5XzT3~j8*!*KjG=(BZton3x0=#PW$)9KLhMzB!%KG0tSJyR_206iaj_bGJbk8u85 zLBAaIbhae<`#}E?=qddH`9H<^4}-oM^i29ELBH_X(NTO~)~8Gdp@+Kc)~T}`c01Ocy=)Yb?yaE1*R-8Wq~OROj%&c0#g>4vcQxD zGFd=uB;flPSc1}}-m|CHpiW48q;x4fy#tW084}GKO!NOpEn6Zq<7L=SZ|z zN7Jo(2bp^BAMF*;h3yV;z4`8Fgl_g?50x$~f5k=nE_7iVOk78pj-|P{W-=olCy7h! z0pU*UFEEY(XERQQB@#P0;wt7Fdfr197iXwJZKZSsJ6pSn|Gsas;I7&b86 z$grE?4u-oK?q&D@!$F3_439EA!BD--u&TQl3whm6nv2mv}Cfc$FhF%tr4jQVp%K(c3{Q z)8ey@b!qWA#`d)MT&o{x;?W1D8LI!J;-`!KO&v$~*>F|me>yE2-md&M6@R*+`bR4M z3`6y!R6Ly9tz+bUHXi!A)8g}uy=mGv+bA#|NQ*DD&TY_3$;KP3RoqI&V^_0E$H@C^ z;{%3zFTYOB#?#|HD|C$FpKZ9;q{E+W#Ydf*jcB9dVk*AKNY^j3jdQFxmdcMP+pS~d zeYW;KaGjcMohD&pUAWCLUK_PB@{Yn#7NU8>yi^K>N5b!L;HfzXuf`jdi=H*PpwIrO z#87#3(enjY8as3WPjb}zWQmuqZTPs1vG&{pd^-KVC-`R?c~!WT3~ur~e`aX-g+(4qfd1%9S6qY5{;RGhvL1-bCX{lY4Nw#qGM{8{g8q9 zXFBj7XZ$S2EBU>QFLL1Nx1=QJJO}@Ts_!h=@IPf-^BR6 z4*WjG-{-(T$@qPYSM#|yfTy_hjbcMw;&~PB=3@d$cs1Wv^I~+bv~h{%w=Rq$=dF=) z^uOui*h(=$9XlSKb0L*Gc0B#KkUw@DX%%wL9y9L57_a8>N}o>)Ib+A2dw{~1N=dBbD3DetUvBB8+$T$L}%zNU3D( zVg7koaFd)~ZkLN0zm4(VE&@opbCk{l zp2}@wIm&KP#y7KI4!g#qjK7BKrR@0+@CTC9%lvSAaox`NgO2|Gb>L~dbBFo0Z=3l!MLUAdcb1+a+y`EsdQvLc~ z;7L!XKYWY%o#Xq^!(48ViRS}P<<4OJ(f!4B3FD8jpM)5{lks0D`&>L%mAG%7sDA(Z4tA*Gt%ns zG~4hirkOr>KoaZXgo^hcxqZDueWk8Z;;QvR->+1At5 zwGCOc6n-uhil?v!I)gZg4hK}l>DW2?T~ZOS(hBKyYg)7BmBqFwQjV z3E-eDL0Tw&e&vUv5Z5k|i$d+;NEF9a5l5wIh9XoPl53XWc)f5BonBX0=0PQboBTbU zaT7n_+H7`5IzxeN@>sSyPbp}@sGx939@NN;M|4!Fzb(=&vy*h3ZB~Kv_=3GsOMf^X ztE&hFa0(nrO-o)Bj>LLeV;Sj{kRR&8*;^So76&4oojAQKBZH^ZyQ03jDG&{H$1^fj zi1Y3;k}HF~aU5wDY|F^8(7$zaMq+t;G}xVyS`i$V>?t#;ZMKCYT_JyGMk$_!U365Q zYSUBlg(GpZHF9cs#20*ze65kK@2TA2IDAWFHT+@WifNt3Q!aT(wQx`8ICWSg8PRFt zIDIzKliATc%-S6dZZZ8^a6DY*nv`#fL?QUprY>d1*f@^kQPRQ@beryRYQ31HZ3;zW zaWfL`9EW$IIPDSwPti9#%opF*Jx&iQlU%`Az&{=v7ut$L_M*W}neE4gwno8tO0I5y zG#EZ57s(o@0_Cy-IAih@jI3JJA8sE<9CO9G$1SwNDipmx=+7K{Jd3y$#2|xX2*+ty zRWzMs$W}pwZwtoZu^1@KCJZfGa9UxT7)yd?NQA6%1bZG^XedA&3Eha11_!PBaZ+L% ze)mg53Z+X!A;Vl%w|04T9kTSanzg1l=+HD*HdfbrP4Ai-I+oSMIm@Qk$4FmIqhZ#3 zd`)$I?F!IR2Di3UggDsMDq-!)l}%n8u3Ejk&dW!TdKTFu8XwGxV?8q;+nMp?O|REW zqZmsI;+V}vW&;k@?BYWntE$AIjW~aiPF)m-ks{JYgBT1koS6RFhKUr4LZB{+@p^G& zY_r*fCaE+VaVjhifn{b5nLDzL<`uPdt)*TsMrF|ik!=$z)9c+L3KEt0pEw_tO{dPg z#C!;&`ve=Qrc4}qOM$-3yhi-6TpS!bp(aN4c6k7--*TJ6fmRraQSfrC*RD 
zS8L{ef2~nYhb>c2a-N-BRizz9E2-){TW18EK)jQUP;REwoX)nTcsj9TtW!kE)}*t% zd&1Vp6~x)Xm?_EN=~<{~5{Wpnuq(0!Bj4ZUb6x~)6~b4TP0KlaD@UAwspxcZF)l3W z?um6E#$f=516PpbagHl@n8R#~Ma&M2+H?Rq@>QzgWMchyOGC4$ndYaH5f)Mg-Ujqq zae}p0$otzF5LP)f{2eEvrx;$kG>s>d;#^P=Mv=}9Q=~ghKI6QGBGCJ)n@8!Uwbx`g z#S&}pg5vmUd=SOS0c*O~Vy?4`9emb$dsl@1Ck*lQ4) zo=(W))vQTdCr;uHfj%Zxn5)Hnj7BA~SzwJ!=Cy5|4XJ~O=z!%k(X4IE+}s)c$Ln27 z=2;XCin%Jry&$%^0zr6@w_I$-t%V^)_ypJF?T9$)D+aK!wH3Y(^*n(NJe1&Fn1B~s zC}X@n0EHn1u|CD0p?P>~mlL(7DbgA9Hmq!HiZq$29RSZtf|2fEx{xxN^|#ln)WXhL z>12#AnA(YT@T5Aw-H&K3FS`P5=wYZ0bZj;^Ve`+9U*%W?HrKCETXo{W%j9jXU<|8Q zy9Pn8u;3&`{lyTR`Z~c{yF=>JfOLxo3OGpbh zVp~<%_@&KQ%mQepgb4v#(b=Gx6(6V7y1P|unUvw1o)+L4T1Pap)gH#-e2YWjIG#yj z-%hO9@Eo_cwXW6U6?MVXc#L;Ws4{J8R*VOiy^b}G*j%LmhgXVE^@t1;!}0yg zY>maVcAwzVQCgG+zRFM}CLSl0*W$U?7_WfYJ43AjGZ5<$Ppx1DOj{aw0G||V&>Y!X z28z`t3c=7(?p(W#8AINWCs43H#w?fu-|Z0bd_qwUkL(cPL_Pk_c*((%-5+1EF!As$ z(2?2!(rgH&GEhrTMq^d92HQhn8DlBVP2$kE1VdkYFm5(?tI1O&I+?+IA#5k)L2bB} z>U^f(^i1rbBBsY`=_S|yDGxFyZ4|JcTZn1uB(8Mqy%*K#v?*j)Xo5zu+>Fg5_Kv)3 zn#^<$yX205r=l&iiI;b#SlU4$ui4}i)6_}cW0fOQZ8OcY@Kr2Kn1WHP1?a17&Hh&L z%pm2V)MTu9Y)d0SAl_M+VSjtN@D3eJ(%Pv8p=*sz5Xf9njF zc3o6}rbiGBh|-PXYbIuU7rw|Nm5_t9r*iBz@nmii>pR3SG;NkOp=1g}8G=w#JRE#~ zydKW-}|4AkIG+}EvX8+Wzzbqeut(I^s@^+|2odEp!y!a;?Gex3~L?v)q6h_bUXN+ z`9B6aj%|o2{ndMR6y%==Snt_!>hDMXGw@OQkFfpOVu!zfP{~Lg%cjT!2>b)ci`Z(V-hKf(YPXSNw+)-)u{t^ZEaz4_Z^i=vY+8DqM z*;M6M?>SL$2P>?~SNT=_zrgvIvq1Ge6b03Lh$#O#_&D=_1vn}~$v50c(B&3?@yn;w z%h~?-fUM_tyCuJZN`ED#kZ%PX_u3f7A6zXTVAr_Ti zy-!9#I%`M2oaH|YqMl#9cSgZmPuFwX{-dxjIr6Lb(c4cUe98~L z4IJ4|>$OUTPUT_BW_PATH - echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 32 - do - for w in 2 - do - for m in 1 - do - for iter in 1 2 3 - do - export INPUT_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" - ./clear_static_ofccl.out $INPUT_PATH $OUTPUT_BW_PATH $cards - ./clear_static_ofccl_time.out $INPUT_PATH $OUTPUT_TIME_PATH $cards - done - done - done - done -done \ No newline at end of file diff --git a/test_scripts/ofccl/clear_static_ofccl_time.cpp b/test_scripts/ofccl/clear_static_ofccl_time.cpp index 4c49834..bcefbb8 100644 --- a/test_scripts/ofccl/clear_static_ofccl_time.cpp +++ b/test_scripts/ofccl/clear_static_ofccl_time.cpp @@ -5,7 +5,6 @@ int main(int argc,char* argv[]){ freopen(argv[1],"r",stdin); freopen(argv[2],"a",stdout); - cout << argv[1]<<" ofccl : "<qS3QgHXf0zt+ZHucbcP)cTl+));LsZK?Tv*WPQN zbLM0w)O74WzigPSv)5XC?X}n5d+o=0>^t1Gt1}!9O@&PDA2flKE}$wp@u<(3*~HGrUpKhdUCXhs!-ClmdEu_>>E5pt zVl+Fvhs&p+%I_0MM|SyZ1DjK8;QUh6Lot_IL24%yuILD~l$T!7(Rz7DAlTh=c~3?8 z<>jSCkx)^Ipp#yd2UXvib9n zvrx&PJc)+_rDqcu@$+#-JhJ@%mT{+J9(tapU4Y*r{0h&#=bT~xGo6w9W@ofDd-Lb- zp0f(*q~UT1pPPj8XJn8JKQ9IUw^QKHNFj$xmrQoJ>w6 zg+3RhDA#BT{K6FZzL`SL`%>t0OA7pNQpo>B3jME1!GCcI{Ie&0Q{Qpgg&=-a3jEJg*z@y{e}=X|tL&1(lwbQQ@E0VJ{~wS;{CjVe!~(=A+{SY1 zZE}WSAL8$2d_MDQ@DGt_tM9VQ$S0I*y580q3hI%lHyqVPn)Ie(-QBoMZ&S{@-KE17{NAL27BcY(T zBM{xDZz;A}D6#IRvQ5KUX>eqqDuX=qfA z*{(lS?Q{CZXsIvM9p%bH12(FzF&h5sdfRks3ofM1=hGwPL0p>N9NZc}_6`0>cc-8H zi%+>0cop9*y085feX}>vp~3G^dle@1M*Tg3D50njo5TK4mtWB~2dEKz8XiX{Y3!Uj zy-2)f)hc~yQL(nBwr1t34I6M>T2xZ5;j(IFjqWO1YNc4Oi%MKFs+Sa%8doJnW$>Yy znDu0fzghU3iTO$<<}!qp|1z}M0-r7TXy%l|bDb=}3}EReg99_3YDCmHQGe04(}@2` zW^jO$Trtt>@!N9zevDYNv>{IOU*Ee08DwkEb3A&7aCy=}f*M~5Zylf2_)2&-jYy)#X~HiQ82Klk^_h(*<*h=21&;<{1g8a$A>9at z7CbivK`XZ4t?P~o3!d^;q0)j^bxAaj1&;<~gnA49R0FGN8!UJ4K{ z(O`|xWx?|jRnU4Y_&kGK(|Rp<>QgG*Wx=1JkbpZacAvqqdYUdBfc8gbfi86W*OBTgGF;gE;Qn_aWX!7 zh7qR?lkri95ziCxcRx_&r4164--y%32+D87X+s3%H{!Grg7O=2+5kcMjW}(5p!`Oh zHat*%6^HfD71poTg#8h3uA|y8<8`*U+ZMmt7B92KFSo@nw8hW0#pm1Nr`zJ$w)kIa 
z6ZQSQE&jGG{+ccRvMv7Kw)jtN@gLjbKeEMtXp4Wx7XN2!-1T-%?Ae< z)edBrW9!K?aQc&|D9^yv2T@%U^Y2BH_jghh7oLIa#|bPy9-WV_6{Kr)ty3mO^3LxC zyLN~n_`e|d%kCy{$p@a;nCIZ{uJIf^p6PKs?RoZt=sXB;vw)n5k4AAQ7&$_Ts2zJL^gP#8R^GKoo9Ypl>4Z?k($YnZql0C7Z z_!_ia@QgqelP6#DT+KX-1W$s1*we7aF(isVPhtnkKE}L5LX)RphjU3J_+r0^|D2G9 z1_h9w_o$Zl#9&{%IJV;@Az+7-GJ!&`%jhPKUM!;{1VH^c{F7&3hsM-HGFr*eZ%Jku zB@F{))aZ$ciJ=`wK!a7Zz8}$5LEh)dX%6l938VXk9({wSE*Gdf&POh$oP{K33FPz% zs^i%3(2ik4AYEmN6i%TC<@^rW=2Q^+1f%2FQP02^1$khHEX?y%o^trtp&ic|qUb8q zmidDu>U)1CQACEQK2aLSXfYZCZ2tguQIX}wYr(e$N_t{s|F6^-_HPgtILf*7@1QG> z4;ry5L-8W$;aPqt@7}@fgHcqKefmL%r$Suh`) zZ3ACqJfwOwhWGr~?I&X0$NO$S;pi^8^Y-J^YP&B% z^*&0~8yC9F|Kl1eYuPt|g6&?U7z8f>ap!*+!k(a{Ptt%N--Cu?$t8CCaZjv!%+v24 zn~dE#8AXhqes~4PB#n|&~a((C}Pr_$`1sp$RnhAA$2$`c#L0FW&@0+|Q?l?wk1 zm4CM-$D?S7Q|0(g;*Fu>k`RxSg2G8bU$zJe*o>lVZWyMX6IbS=rlPJyiOpjya*Gn# zUzy_>sCP=W>vN7>Ag_kv_X>>;dis&ZEYn=H*o1PZ`tDcxVrW!Wv9=Af(V>8-2uMBn zO18%_Y-?}G%7Rg)6ABE)?}7_Tn)?u%5?Aj~B~3bIRS#8)J7l&tji|LvI*o=EYmR&R zZXeU4AD31`w7*VSY(a{=6SPf~WtdL*&pA?nrG47C-EnF)dBV-2B_LQBjm?o^+7e@rx2t z?MEBYEY@saA{Y~ImGq(b2Q+rl_&^#A#m~ip$kSg8D^XTQ zJh5LA_Pa9o{hmJeey2)#I5FjKiAQtwua&~fY!u*?CHM|=wChQgfZiP_#eU@u8sY!HugsRv+rWQaxQ5? ze6e2?W7Un8TupV&nL#Yy>Q_l857>@Cr$Oknf3hb4X(;}j z=#Yn$#{c|B*!Pk{p4bb(3}OJ2Q&Oyu(1*eP#u(^ax^2XZuE^1wMHKB~0A64KP&$Yw zP#WrI4j@qd%zQO5FL#fG=PJxvQUsVSxeBu!7%|5|D(fsUJH8NBLo4uDTY;?h$^IC% zZ~Ec`X8Voby8mgW{T0}$P$7Y_JXw(xRh*2uZo$T`*wo7D|lYfcO?Q)rKLPUp8ZQn0*Ccg4r!P%YpxK)G%xy+|}K7?nJ#~aA4%bgsbTuQ>uL_S*nw?P@4H7 zJ!B-#7K&h#1bvfeFv-z0O-`Y2?g z*T-d>!P1Kg7emGa=mi7RuS8FnLr$HB;>03fg`BLbmt?9XT7gwOK^3>$J(hPrEvK{N z8EzgcQRD>n3 zNtPsEP=V}GVjjU_b%V(Eh*5i2xrd`GfDtx%hG#bH8lMzVn!*iAx8RBWEIurZ;6Ca^ zr;Iy~JuV8-ceujhDJJKtavXr4F^7;5h(bj!m#Cq*Q#2zQcwm;Hc%{hxS9H06C?erd z{CTMW@_v{&k6lHkCZ^0csX0(*q4J3q_$~&i_@9B7Jcr_W$G{`^3+B;fqT;X?wU^_t z))>ZCk_3n`f`;oDP1sPwMx8xFiZI76H5)P-|Bn-Q4U8%k>q&GIQME$JLy&_RqNVz| z9;xpbOUQgmF}be6VJd!dGy23LKucHrtmC7M=sT}|xys^*&5@2i@sVkeqC1D78Fy0P|> zzd((g_|IO0{DXhVjGpg$k<-`4#xd!uj=fPeG4I!&zC#Yr@|U~cM7eTqxRIuJRX0`X zhc<66$~%8MWhe5ZDi=+X?mHQGngn^~&x{3TEHGn%84LWKEa1R9Eh?xhOg97;n>cvm z#(W2-64$->i5D)2nHUgG%5P_~q@l?jiJ?aT=(hiC{3y>;@T1%Kr{OmTKQZmbpRnfD zoT1G=Ux)KbrzS6={z%jr4Rrdi)GjrGw#Dy@hQe1~sn_XvmyX`h(*wbXKOEH@3o@^| z3;NeX?vh_mOdP=VLn9LtuL2$bJOQ}mm5GT50q6gAV&Z)$`Nr!L6J1cG^bJfH0P6tf z0q+AW1>6GI1o%zBDB#n8y8vGYd;~BX&Qb^$&PxD)VafDZ$H9QV|Q0h<8F z0KWj3k9P7PU@_nrU_Ia*Y!N*IcpKmeK=m$+rwP;P5MtjKO1zR zucB`@>2HC4Dd^&TFh#%4q|bpqQ_+{4gbwkV=a~nVAPOg(6hWUC8G*==Z70@d)T+phLbXqs1)8bD*C9Jy|)ONJ4fb`X>Oh z9)NX~9vL1Xgm`G7Nw)A1&z%;YJIs7mz(1`6&mG_~Wo$9Zu@Us|f_|};ev_I0Hqajh zo$O+yzrm#M0sV2%>2W55USrY^fc_BZ$;wG~e2M8;LYV0TX8IGLKY{dlR{Cm_eh%u1 z^j~hJcbW7RpdXn+zm1@i{zcaGeltDom{aEls{vQV4(6H0R2oX ziOQ|?PBZ;Wpf3SES^3E3kD&bLfc`#U);Hj6RXcE_J-(h{jUuAT6r>72?i=}Dd?BdIB2Cq2DN;mH-Wwm z^kr6h(3Bqq-4FVwtn`d}L;fz%zY2OXdpMEi5zu#nzQLNF%3$$@N5OL#JgIDc9Q0>E zPt`XIVCMe@eT6lTh*^dT(8oc)$V$J_q;CM7<_f9Weh=ugFgHlmc6Wn*4(O@sU_a;{ z(95j(cbNGf1^p(_7g_1ooAl$LdqGcTW2(af%pDp*Pvu)GKyLs&V3nV-Nn}9jX^ygI z3i&j5`5fr0t?6%<`ZGFy`TeV})M zp3onV{xh8Z2(Po;kx^w0|v6L>DxEPqC|D4-Ll9LzcQfG*zMS9Ks=5~a|{*DV20 zJ@W7qVs&o*=QC<@3qF_W&UFrCxpNEe$*#^V{-;@<+={;0YjP`txfNBp#Z|e5D|4MI za|>4H=C91n5#LLQLQW6lzy%orkJg>J_Ga9>b7X&gw)WkOXEI-&r9CklETD)VuD*Ps zW5HF|YuZm7c z@xak{92Pz!^O=ho3(Qzx#sV`In6bc&1!gQTV}Th9%vfN?0y7qvvA~D5fY@NbH<7V4 zrJ%mEr`D!+NPDLg6rR2TNMViyn&y-jr*pjLN_ipH>9|zi0bIazb*}|)sEMGy`$u~S z6tE2@@W)wf(t?_zEt#(p%hf0t>b?L{RyQJuF}pA0BxZ<%3-X3Xdp@ z_--~VZsm%&(i682Mf_T(zI}>UN36PKl{xSkG`1!!Cxs40kfz%kW`_ zgA9imjxZc!s6J+xzfcpw$*`DVCBu4#n;3R6>}9xoXu1h3drAQ4k(Yp#%MO)bD>!1}$@tKW0|M)}1=e1Vp%UuJ4&8GbC0A6~Z0ijmiumhXXEshP%U 
z5+>Gx%Pj5P2@@l)$PHy+x3qpyB8A)|;d^a(st&@d@kXVhdr%JOvp*~`lph7U@8C#c zhfd&0j{2S~@lx1^pF^8$&)vW$)Bh2{KUd4C#07%N_h;gIu69Z#VNLw2;(D%jY9(P! z{JY|Mf%ce9e-h}xm-XjV5;y4}!^Moxuat3>?{dbUVZ+l44kTx;4Sy5k=P_Q%zn$>~ zHvCr@e~t}LuPT##rwu>C_{BDSHpWNdFSOwoGk%E;znbyIHoTAVr8fMXjIXfa_cDHk z4L`*AN*n$i##h_$ABTS={XI53oeMztS{uHN@%1+R=NaE*!+)Rg8*KO&8NbnnpM{E} zd^g$f^x8k+TW$F38Q*Th-@*6}8=l@#BspC+{Bw+t+VCeB-($mHAnr@d)o!!lYZ%{a z!{5sI?Kb@VjK9l<{}JQwvEhHq_?fJ2_)gwd|S6YsxJgdaB@!#{i{tXkjDNf^ z5kJQGIpQHDJafUqHk>Nuyme_J|8C&(wX?MX+j#ml<}c)YS)7)E1qkVLHd@C^pA$wMoO9AJx zoO^*!#{W_Z{-QIaoZZW%fLfNb7x;xJm->D@n!N}IfKMiW4v?hJp$f_8Vg44zKh5n^ z;h#gn=VnZv_dd-0H*>q(llRe*KC zll&LBzbKxE(cq~+Iu}W9RgXVl{?oXI5GC!KV2)h{nM_YgY7VtFQ*~h5}{CqyvYlC4H zz~J9SdrQPo%;A*z+WpUZ#=pS%BDn}(0X~_1{vCMIe+L^<@tlW&lJFT^FXd8<_6^`k zPkVd#9`oDB_jiC-GFgNG^DqcfKhmz0SU(f#DRRoUiuFhL7hwtGbIW8r!1x&BkCaHn z2F7Qgf0CT1SaGGpCm8=Mw=3@6+8*E)4~r0Be+v9BSr>76d*Sd&pMT(XuG;M= z@J{Bjf^3Xm#Q!ap54R>lD~)SbM!vp{`R(oPKLvlj*2VRk&HOJgzuiAyf%>L=ALsT4 z^NO&G@gBB=@~dw%-tOP3;rB_-=eS~#oe1|a-pPJa)%yp)Cp(vI9vDc@qs)kE5}^h0 zWOlfd`7_xMi02iN;tk-PGHJr!KQRAwY-esRS{v$#^8F;Ye+=s)bOG<=Yb&^$`ETNU zxp`^7W&B-$6qL?u!9eou?cwXdD;}AG|N6qwNVFSo()u*>Tsb}3sr%?;xd_g9Y7Oaa z9ibL)hu(@8pCY=qyGQedI=edjQGaVuMR7UKm`cW^&UY1U8Viozix^S#lW3<+_yer`M`L_ll ze%%)e;$X9GA5Pm6q^07uEHAW$%r=QE53~hCVVqe-92L4A2vMGhu3L^X_JZAXgk5ck z3nlSy_I7tfb-dVhi{2IL2>7q@-6sexMTvaHZr}<_mRn;0Ujj46b7Ls=BI1 zUpUYeP03Uy4!uiBuJHFnaju!aH6_PV@77yV5=+~{{;rhNGXIohSBXw_vn?3v40tepGp_9uaKL20I*Am+LSLN%Uf^WGjhBqjhV$w$ADwRB>TClrg ziZYZ-Ms%7ej-U;7r*3F2X6*|5x9Hw2I3q4~NlG_|!Vr9NQx`L1WQvC4QqqDUbepaz zO1+GwZ4QJZQ9Tswn1XkyIPww#PtrGB%op9(HAN39kzD?W&pQV z6R3-9+-{s3+oU(5N-FdQ91Y7uV2NH$<_>M6$wf_VOR?LHQCU<$XxsGibi22Rj6@;+ z9*3l|>C~Z@m=s}jpJpXhmxz;Z$leU`P;d1{ zy+%IMEc_}_ct43Unn${9V49QERhpVJK-bcsf2%xqm!^-?vbVd!=w6-mYw46;%M_Ci z_{BW56SL_64saHkOs`Kx%jK5{YPffmqj$McA{?)~4E?HRJ)9fPbJe>s(0;^rG>*JY zc6@O{zaT}{TFm{?TBDRsT&A96KR~&%(sClLq^d)0?H+I%{!Ugxsh&`BI^dT4>GaOB zP7oz)lMeLm4jLnu9|sI$rX-!GYpJ41B>c$I&d?T&e1B7$bHi~f7rshwT*>ZRHNs=|XX) z2y(YFmQZLt>1h3~GAv^;mopY)AH@nm%)G^939CJ0AUQTdx?4t+dpW3-zQH%x+p+W2sB4 zUD-fk1#=BT)6;3Scr|OXtP|6?Ltq_~%Jgf+e2hjVu~}e@O#1b$9rcNWi0FW&G|{YS zNL}42+mGA5p3GAo_KUeH#yvl_xqN=KB6q3Sj9U*w3ik=F^zDe)$}0x2$)y!-AL_Y$ z?YJ|+yD&aCwooRudLI;q6!`jt_6*H~TRZKjb&a8ph`WAuLu05>PwW7=Vd4*U`IEVn zbe7*-uTl-$XQk;FUof>3>)^CDezPCZT3&Vq+R(#L>ubM7-;B*a(|?s>5!h6>N^RAN z8!zeGTEQ4rr*;keZehVRdHrRRJ~LQjc1Y_qAlc%9oX;|NRhmnxev`&R8=}NsimYPH z&M*LXdT;UTa>2P7+a3NMpC7lxjK$}4d;|3|+Y(lERE#fjxz-q^djVU$;ntd|2Aiq8 z52g{x9T40Odm#2cM$!Lf7mlTR9dtZzoo4xQ|=YQrqHki?U#9JgId~t1zu@6%j5N{iCKR}lzB#O;LA8r=7q)GA>1jzEh~_eHwJT`O1t)0TQ3 zz|&$4nj;&_K(X3HCKx(O?Q6G5W5`Ew2MX55m<3bd*$&~)r{(2v%MKn+l;dyom+UOL z`Qs4_9XH>6?TH;A$%arO1J!goDyyu;-xdf;A4`5NjYHpZ41I0>sNU42CQqSoI-U7a z*iOiU+Gtv;^XXpQHNBgPm>#R8mt6lR+{jGZC}7;T5YyB&u5`@37uD%3Q^?N1G>u|3 zGd7RdJ94jU)RW!pk~;#fvev+6Uf$_qX$OVedZR~7Q`5Z1Do3W8CYopAQ7lZD{9&vG z==ruLZ;QBRkZ@Bf9V;Hw((s$j!L0%G;s$@DyVH*~$*NWQ(xT$@jV^L|WqNIT@J>ok zV{cXCdzvd}nYzo5#v^yWg$woHI=#l|Zb3tuG#R7yxV&4HX6@CWiagWdYfFuNQ8@2( zF7Zt((?b|)Cr4O%sn6K6G^YG%8ae_!v=)}_Qp{V$V>9NW+<3+WwPHBuP|MAD-~k)9 zP~yYRaB0^?IcT~CQ4cTOARaT(+dA6YTBYNV`_lx-Ez-B#ee7Ws5%?!Si7W z5gjzNWiKV*tFDfyRzx@IwIW;=wT1X1;`eDq*k?ynco zZ%_Y8&~cPQNa?S>x1%6Gu5WyA$F9E@=}*H?r9aC0E4Yc{R7SF$O0VA62mnVcD!ux? 
zkAmuZK41~SZvR_>rEle^^z{BK1qD4aYUr={6#NY6^z9uLSKl#Fa4)AP{Yg)yKclq) zT#!vwdiA{%1$$ZHS)4$nSLMH-)30Q@`Ywus>U%_#{w)0L=^p@&vQY9hCn6M_;xFES zN(}b;-vhEWz0)cA6;%2wDR>h~1O@*EF>8ADJr)HOA1A{b9Wp5Rur0m%Zi|A}GdL)v zN~hpsw)EFg1?q==f{~x5s ztJb1#P!8&QU&9+IX4aRCneA8UkqPDLT`zVM#`gzCIKARal|B#YsqL%u>ialxl|S2$ z`Y#0~Pql+@14s5#>DBkZj^9Mw;3uLbsPqaJfNo81yf3*)CQ|)eiBRcO{5&KkoXWql zi_>kAu-8`ql@e|IUBvkp8yI;}ut`FGt;(R1t`w}$mW7~uHVL57d(#x?{XHE&rN1iI oDZnY+acP8o-STosz39ahm9Jtv0~g8ChrTX3=u8`Hutl~12iOU-{r~^~ diff --git a/test_scripts/ofccl/run.sh b/test_scripts/ofccl/run.sh deleted file mode 100755 index f7158da..0000000 --- a/test_scripts/ofccl/run.sh +++ /dev/null @@ -1,47 +0,0 @@ -export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib -export NCCL_PROTO=Simple -export NCCL_ALGO=Ring - -export DATE=221221 -export OF_ORDER=1 - -export TRAVERSE_TIMES=10 -export TOLERANT_UNPROGRESSED_CNT=10000 -export BASE_CTX_SWITCH_THRESHOLD=80 -export BOUNS_SWITCH_4_PROCESSED_COLL=0 -export DEV_TRY_ROUND=10 - -# export SHOW_ALL_PREPARED_COLL=1 - -for MY_NUM_DEV in 2 4 8 -do - unset CUDA_VISIBLE_DEVICES - if [ $MY_NUM_DEV = 4 ]; then - export CUDA_VISIBLE_DEVICES=0,1,4,5 - fi - export RES_DIR=test_result_${DATE}_${OF_ORDER}_${MY_NUM_DEV}cards - if [ ! -d "$RES_DIR" ]; then - mkdir $RES_DIR - fi - - for n in 32 - do - for w in 2 - do - for m in 1 - do - for iter in 1 2 3 - do - export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" - ## Time - echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G - do - ## Test - /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH - done - done - done - done - done -done diff --git a/test_scripts/ofccl/static.sh b/test_scripts/ofccl/static.sh deleted file mode 100755 index 3a65584..0000000 --- a/test_scripts/ofccl/static.sh +++ /dev/null @@ -1,21 +0,0 @@ -g++ statics_ofccl.cpp -o statics_ofccl.out - -g++ statics_totalCtx.cpp -o statics_totalCtx.out -export RES_DIR=test_result_221120_2cards -export OUTPUT_PATH="./$RES_DIR/result_statics_all.txt" -echo $(date +%F%n%T)>>$OUTPUT_PATH -for n in 4 -do - for w in 2 - do - for M in 4 - do - for iter in 1 2 3 - do - export INPUT_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_M"$M".txt" - ./statics_ofccl.out $INPUT_PATH $OUTPUT_PATH - ./statics_totalCtx.out $INPUT_PATH $OUTPUT_PATH - done - done - done -done diff --git a/test_scripts/ofccl/static_time.cpp b/test_scripts/ofccl/static_time.cpp deleted file mode 100644 index c079845..0000000 --- a/test_scripts/ofccl/static_time.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include"bits/stdc++.h" -#include -using namespace std; -int main(int argc,char* argv[]){ - //cout << "bandwidth"<<" "<< argv[1]<<" "<< argv[2]< a; - vector b; - string ss="bandwidth"; - string str = "N/A"; - while(getline(cin, inputLine)){ - if (inputLine.find(str,0) == -1) - continue; - - stringstream line; - line << inputLine; - double tmp; - line >> tmp; - - a.push_back(tmp); - } - cout << argv[1]<<" time: "<>$OUTPUT_PATH -for n in 4 -do - for w in 2 - do - for M in 4 - do - for iter in 1 2 3 - do - export INPUT_PATH="./$RES_DIR/test_result_"$iter"_n"$n"_w"$w"_M"$M".txt" - ./static_time.out $INPUT_PATH $OUTPUT_PATH - - done - done - done -done diff --git a/test_scripts/ofccl/statics_ofccl.cpp b/test_scripts/ofccl/statics_ofccl.cpp deleted file mode 100644 index 462fffe..0000000 --- a/test_scripts/ofccl/statics_ofccl.cpp +++ /dev/null @@ -1,36 +0,0 @@ 
-#include"bits/stdc++.h" -#include -using namespace std; -int main(int argc,char* argv[]){ - //cout << "bandwidth"<<" "<< argv[1]<<" "<< argv[2]< a; - vector b; - string ss="bandwidth"; - string str = "N/A"; - while(getline(cin, inputLine)){ - if (inputLine.find(str,0) == -1) - continue; - - stringstream line; - line << inputLine; - double tmp; - line >> tmp; - line >> tmp; - a.push_back(tmp); - line >> tmp; - b.push_back(tmp); - } - cout << argv[1]<<" algbw: "< Date: Fri, 23 Dec 2022 06:50:07 +0000 Subject: [PATCH 075/109] =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=91=BD=E5=90=8Dbug?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 06799ae..9eb5383 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -11,13 +11,13 @@ # 设置超参数 # run DATE="221222" -runNcclTest = False # 运行nccl测试 +runNcclTest = True # 运行nccl测试 collectNcclResult = True # 统计nccl测试结果,写入xls -runOfcclTest = False# 运行ofccl测试 +runOfcclTest = True# 运行ofccl测试 collectOfcclResult = True # 统计ofccl测试结果,写入xls NCCL_ORDER="1" -resultXlsName="result_"+DATA+"_"+NCCL_ORDER+".xls" +resultXlsName="result_"+DATE+"_"+NCCL_ORDER+".xls" n = 2 m = 3 #nccl w = 2 From 9daa76c9f20fda4b3c7a3e9752f6d54aa4271063 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 08:36:57 +0000 Subject: [PATCH 076/109] =?UTF-8?q?deltaSec=E7=9C=8B=E8=B5=B7=E6=9D=A5?= =?UTF-8?q?=E7=BB=9F=E8=AE=A1=E5=BE=97=E5=81=8F=E5=A4=A7=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nccl_test.sh | 2 +- ofccl_test.sh | 6 +++--- src_simple/common_simple.cu | 27 ++++++++++++++++++++++----- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index b5ca1d9..76cd861 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -19,7 +19,7 @@ if [ "$BINARY" == "DEBUG" ];then export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=16 + export NITER=8 export NBYTES=8K export WARMITER=2 export MITER=1 diff --git a/ofccl_test.sh b/ofccl_test.sh index 073c8d0..2070999 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -45,7 +45,7 @@ if [ "$BINARY" == "DEBUG" ];then export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=8 + export NITER=5 export NBYTES=8K export WARMITER=2 export MITER=1 @@ -55,10 +55,10 @@ elif [ "$BINARY" == "PERF" ];then export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=1 + export NITER=8 export NBYTES=8K export WARMITER=2 - export MITER=16 + export MITER=1 export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 6701244..bac1c31 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -800,12 +800,22 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t size_t count = args->nbytes / wordSize(type); + // Sync,参考nccl,把这个也加上吧。 + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + 0 * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + Barrier(args); // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter 
= 0; iter < iters; iter++) {
+    auto iter_start = std::chrono::high_resolution_clock::now();
+
     for (int miter = 0; miter < multi_iters; miter++) {
       seenCqe[miter] = 0;
       TESTCHECK(startColl(args, type, op, root, in_place,
@@ -814,9 +824,15 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t

     TESTCHECK(completeColl(args));

-    // int cudaDev;
-    // cudaGetDevice(&cudaDev);
+    auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start;
+    double iter_deltaSec =
+        std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count();
+
+    int cudaDev;
+    cudaGetDevice(&cudaDev);
     // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
+    if (cudaDev == 0)
+      OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6);
   }

   auto delta = std::chrono::high_resolution_clock::now() - start;
@@ -825,9 +841,10 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
   deltaSec = deltaSec / (iters * multi_iters);
   if (cudaGraphLaunches >= 1)
     deltaSec = deltaSec / cudaGraphLaunches;
-  // int cudaDev;
-  // cudaGetDevice(&cudaDev);
-  // OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters);
+  int cudaDev;
+  cudaGetDevice(&cudaDev);
+  if (cudaDev == 0)
+    OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters);

   Allreduce(args, &deltaSec, average);

From 1f58d485744e2e6be6cfee389c05ab41094aed81 Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Fri, 23 Dec 2022 08:59:46 +0000
Subject: [PATCH 077/109] meaningless NEW_TIMER

---
 src_simple/common_simple.cu | 45 ++++++++++++++++++++++++-------------
 src_simple/common_simple.h  |  2 ++
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index bac1c31..884fdba 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -811,10 +811,17 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
   Barrier(args);

   // Performance Benchmark
-  auto start = std::chrono::high_resolution_clock::now();
+  #ifdef NEW_TIMER
+    double deltaSec = 0.0;
+  #else
+    auto start = std::chrono::high_resolution_clock::now();
+  #endif
+
   for (int iter = 0; iter < iters; iter++) {
-    auto iter_start = std::chrono::high_resolution_clock::now();
+    #ifdef NEW_TIMER
+      auto iter_start = std::chrono::high_resolution_clock::now();
+    #endif

     for (int miter = 0; miter < multi_iters; miter++) {
       seenCqe[miter] = 0;
       TESTCHECK(startColl(args, type, op, root, in_place,
@@ -823,21 +830,29 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
     }

     TESTCHECK(completeColl(args));
-
-    auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start;
-    double iter_deltaSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count();
-
-    int cudaDev;
-    cudaGetDevice(&cudaDev);
-    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
-    if (cudaDev == 0)
-      OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6);
+
+    #ifdef NEW_TIMER
+      auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start;
+      double iter_deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count();
+
+      int cudaDev;
+      cudaGetDevice(&cudaDev);
+      // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
+      if (cudaDev == 0)
+        
OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6); + #endif + + #ifdef NEW_TIMER + deltaSec += iter_deltaSec; + #endif } - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = - std::chrono::duration_cast>(delta).count(); + #ifndef NEW_TIMER + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast>(delta).count(); + #endif + deltaSec = deltaSec / (iters * multi_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec / cudaGraphLaunches; diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 406f634..c80dfa9 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -18,6 +18,8 @@ // #define DEBUG_PRINT 1 +// #define NEW_TIMER 1 + #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) #define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) From fc09438e79c926434fd9cd77b1b88afd9859a923 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 12:01:05 +0000 Subject: [PATCH 078/109] check frequency --- nccl_test.sh | 12 ++++++--- ofccl_test.sh | 16 ++++++++---- src_simple/common_simple.cu | 50 ++++++++++++++++++++++++++++++++----- src_simple/common_simple.h | 3 +++ 4 files changed, 67 insertions(+), 14 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index 76cd861..89ba9a8 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -17,7 +17,9 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/all_reduce_perf" export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=8 export NBYTES=8K @@ -27,7 +29,9 @@ if [ "$BINARY" == "DEBUG" ];then elif [ "$BINARY" == "PERF" ];then target="./build/all_reduce_perf" export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=4 export NBYTES=8K @@ -37,7 +41,9 @@ elif [ "$BINARY" == "PERF" ];then elif [ "$BINARY" == "MS" ];then export MY_NUM_DEV=8 # target="./build/ofccl_all_reduce_ms_perf" - # # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi # export NITER=200 # export SHOW_ALL_PREPARED_COLL=1 # export WARMITER=0 diff --git a/ofccl_test.sh b/ofccl_test.sh index 2070999..2745011 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,18 +42,22 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export MY_NUM_DEV=4 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=8K + export NBYTES=256 export WARMITER=2 export MITER=1 export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=8 export NBYTES=8K @@ -63,7 +67,9 @@ elif [ "$BINARY" == "PERF" ];then elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 
]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export NITER=200 export SHOW_ALL_PREPARED_COLL=1 export WARMITER=0 diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 884fdba..b7ef3ab 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -103,6 +103,28 @@ static int average = 1; static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; static thread_local int seenCqe[MAX_COLL_NUM]; +// bool StringToInteger(const std::string& str, int64_t* value) { +// char* end; +// int64_t v = std::strtoll(str.data(), &end, 10); +// if (end == str.data()) { +// return false; +// } else { +// *value = v; +// return true; +// } +// } + +// static int64_t ParseIntegerFromEnv(const std::string& env_var, int64_t default_value) { +// const char* env_p = std::getenv(env_var.c_str()); +// if (env_p == nullptr) { return default_value; } +// int64_t value; +// if (StringToInteger(env_p, &value)) { +// return value; +// } else { +// return default_value; +// } +// } + static double parsesize(const char *value) { long long int units; double size; @@ -810,6 +832,9 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t Barrier(args); + // int64_t NEW_TIMER = ParseIntegerFromEnv("NEW_TIMER", 0); + // int64_t SHOW_ITER_TIME = ParseIntegerFromEnv("SHOW_ITER_TIME", 0); + // Performance Benchmark #ifdef NEW_TIMER double deltaSec = 0.0; @@ -819,7 +844,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t for (int iter = 0; iter < iters; iter++) { - #ifdef NEW_TIMER + #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME) auto iter_start = std::chrono::high_resolution_clock::now(); #endif @@ -831,7 +856,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t TESTCHECK(completeColl(args)); - #ifdef NEW_TIMER + #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME) auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start; double iter_deltaSec = std::chrono::duration_cast>(iter_delta).count(); @@ -856,10 +881,23 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t deltaSec = deltaSec / (iters * multi_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec / cudaGraphLaunches; - int cudaDev; - cudaGetDevice(&cudaDev); - if (cudaDev == 0) - OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + + #ifdef SHOW_AVG_TIME + int cudaDev; + cudaGetDevice(&cudaDev); + if (cudaDev == 0) + OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + + // int clockRate; + // cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, cudaDev); + // int memoryClockRate; + // cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, cudaDev); + // OFTEST_LOG(TEST, "Rank<%d>, clockRate = %d, memoryClockRate = %d", cudaDev, clockRate, memoryClockRate); + + // cudaDeviceProp prop; + // cudaGetDeviceProperties(&prop, cudaDev); + // OFTEST_LOG(TEST, "Rank<%d>, prop.clockRate = %d, prop.memoryClockRate = %d", cudaDev, prop.clockRate, prop.memoryClockRate); + #endif Allreduce(args, &deltaSec, average); diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index c80dfa9..1e61943 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -16,9 +16,12 @@ #include #include "nccl1_compat.h" +// 环境变量是方便,但是会多一些判断,可能影响性能。 // #define DEBUG_PRINT 1 // #define NEW_TIMER 1 +#define SHOW_ITER_TIME 1 +#define SHOW_AVG_TIME 1 
#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) From 266d3c8099340e63ecb9bd2861c1c03102d83aa2 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 12:56:02 +0000 Subject: [PATCH 079/109] update xls name and ndev --- ofccl_test.sh | 6 +++--- test_scripts/auto_test.py | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 3465366..c63c780 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,11 +42,11 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=16 - export NBYTES=8K + export NITER=8 + export NBYTES=128K export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 9eb5383..3edf8b2 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -17,12 +17,19 @@ collectOfcclResult = True # 统计ofccl测试结果,写入xls NCCL_ORDER="1" -resultXlsName="result_"+DATE+"_"+NCCL_ORDER+".xls" -n = 2 -m = 3 #nccl +host=os.environ.get("HOST") +n = 8 +m = 1 #nccl w = 2 -M = 3 #ofccl -NUM_DEV = 4#设备的卡数,实验用到的卡数写在循环里 +M = 1 #ofccl +if host=="oneflow-15" or host=="oneflow-16": + NUM_DEV = 4#设备的总卡数,实验用到的卡数写在循环里 + ncards = [2,4] +else: + NUM_DEV = 8 + ncards = [2,4,8] + +resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+m+"_n"+n+"_w"+w+".xls" # static os.system("g++ ./nccl/static_nccl.cpp -o ./nccl/static_nccl.out") @@ -36,7 +43,7 @@ bwSheet = table.add_sheet('bw') tmSheet = table.add_sheet('time') cnt = 0 -for MY_NUM_DEV in [2,4]: +for MY_NUM_DEV in ncards: if 'CUDA_VISIBLE_DEVICES' in os.environ: del os.environ['CUDA_VISIBLE_DEVICES'] From 4702178642bf91ab7641f68ede153dcac18e5c02 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 13:09:55 +0000 Subject: [PATCH 080/109] add log --- ofccl_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 2745011..6bc92ba 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,7 +42,7 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=4 + export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi From 5f7b4bfdcc98c24359dd8e46e769b7fe9060dc67 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 13:30:56 +0000 Subject: [PATCH 081/109] update env --- test_scripts/auto_test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 3edf8b2..a99abab 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -5,12 +5,18 @@ os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" os.environ['NCCL_ALGO'] = "RING" + +os.environ['TRAVERSE_TIMES'] = "10" +os.environ['TOLERANT_UNPROGRESSED_CNT'] = "10000" +os.environ['BASE_CTX_SWITCH_THRESHOLD'] = "80" +os.environ['BOUNS_SWITCH_4_PROCESSED_COLL'] = "0" +os.environ['DEV_TRY_ROUND'] = "10" # test # f = os.popen("./nccl/run.sh") # print(f.readlines()) # 设置超参数 # run -DATE="221222" +DATE="221223" runNcclTest = True # 运行nccl测试 collectNcclResult = True # 统计nccl测试结果,写入xls runOfcclTest = True# 运行ofccl测试 @@ -29,7 +35,7 @@ NUM_DEV = 8 ncards = [2,4,8] 
-resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+m+"_n"+n+"_w"+w+".xls" +resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+str(m)+"n"+str(n)+"w"+str(w)+".xls" # static os.system("g++ ./nccl/static_nccl.cpp -o ./nccl/static_nccl.out") From 5ced1a08f2681f3dd90f6f63f1d70a68cad37bc0 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 24 Dec 2022 04:59:06 +0000 Subject: [PATCH 082/109] +nccl ofccl run.sh --- .gitignore | 2 +- nccl_test.sh | 4 +-- ofccl_test.sh | 4 +-- src_simple/common_simple.h | 4 +-- test_scripts/nccl/run_nccl.sh | 42 +++++++++++++++++++++++++++ test_scripts/ofccl/run_ofccl.sh | 50 +++++++++++++++++++++++++++++++++ 6 files changed, 99 insertions(+), 7 deletions(-) create mode 100755 test_scripts/nccl/run_nccl.sh create mode 100755 test_scripts/ofccl/run_ofccl.sh diff --git a/.gitignore b/.gitignore index 81a260f..0eba5f0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,6 @@ .vscode -test_result*/ +*_result*/ *.xls *.out \ No newline at end of file diff --git a/nccl_test.sh b/nccl_test.sh index 89ba9a8..e96be13 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -16,13 +16,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 export NITER=8 - export NBYTES=8K + export NBYTES=64 export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/ofccl_test.sh b/ofccl_test.sh index 6bc92ba..2dd6284 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,13 +42,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=256 + export NBYTES=128 export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 1e61943..e8ef280 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -20,8 +20,8 @@ // #define DEBUG_PRINT 1 // #define NEW_TIMER 1 -#define SHOW_ITER_TIME 1 -#define SHOW_AVG_TIME 1 +// #define SHOW_ITER_TIME 1 +// #define SHOW_AVG_TIME 1 #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh new file mode 100755 index 0000000..58c26fd --- /dev/null +++ b/test_scripts/nccl/run_nccl.sh @@ -0,0 +1,42 @@ +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +export DATE=221224 +export NCCL_ORDER=1 + +for MY_NUM_DEV in 2 4 8 +do + unset CUDA_VISIBLE_DEVICES + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi + export RES_DIR=run_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards + if [ ! 
-d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 8 + do + for w in 2 + do + for m in 1 + do + for iter in 1 + do + export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/ofccl/run_ofccl.sh b/test_scripts/ofccl/run_ofccl.sh new file mode 100755 index 0000000..1f6c486 --- /dev/null +++ b/test_scripts/ofccl/run_ofccl.sh @@ -0,0 +1,50 @@ +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +export NCCL_MAX_NCHANNELS=1 +export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +export DATE=221224 +export NCCL_ORDER=1 + +export TRAVERSE_TIMES=10 +export TOLERANT_UNPROGRESSED_CNT=10000 +export BASE_CTX_SWITCH_THRESHOLD=80 +export BOUNS_SWITCH_4_PROCESSED_COLL=0 +export DEV_TRY_ROUND=10 + +# export SHOW_ALL_PREPARED_COLL=1 + +for MY_NUM_DEV in 2 4 8 +do + unset CUDA_VISIBLE_DEVICES + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi + export RES_DIR=run_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards + if [ ! -d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 5 + do + for w in 2 + do + for m in 1 + do + for iter in 1 + do + export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH + done + done + done + done + done +done From 875d1d5753c1bd1664e4504e2432c91c9415baf5 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 25 Dec 2022 11:16:33 +0000 Subject: [PATCH 083/109] report rank 0 avg time --- nccl_test.sh | 4 ++-- ofccl_test.sh | 4 ++-- src/common.cu | 2 +- src_simple/common_simple.cu | 2 +- src_simple/common_simple.h | 2 +- test_scripts/nccl/run_nccl.sh | 2 +- test_scripts/ofccl/run_ofccl.sh | 8 ++++---- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index e96be13..4ce69c7 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -21,8 +21,8 @@ if [ "$BINARY" == "DEBUG" ];then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 - export NITER=8 - export NBYTES=64 + export NITER=5 + export NBYTES=4K export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/ofccl_test.sh b/ofccl_test.sh index 2dd6284..33982e7 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,13 +42,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=128 + export NBYTES=64M export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/src/common.cu b/src/common.cu index 716362b..f22be54 100644 --- a/src/common.cu +++ b/src/common.cu @@ -780,7 +780,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); 
TESTCHECK(BenchTime(args, type, op, root, 0)); - TESTCHECK(BenchTime(args, type, op, root, 1)); + // TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } return testSuccess; diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index b7ef3ab..52d6be6 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -886,7 +886,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); if (cudaDev == 0) - OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + OFTEST_LOG(TEST, "Rank<%d>, time = %lf us, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); // int clockRate; // cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, cudaDev); diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index e8ef280..8801172 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -21,7 +21,7 @@ // #define NEW_TIMER 1 // #define SHOW_ITER_TIME 1 -// #define SHOW_AVG_TIME 1 +#define SHOW_AVG_TIME 1 #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh index 58c26fd..c8d5ec9 100755 --- a/test_scripts/nccl/run_nccl.sh +++ b/test_scripts/nccl/run_nccl.sh @@ -19,7 +19,7 @@ do mkdir $RES_DIR fi - for n in 8 + for n in 5 do for w in 2 do diff --git a/test_scripts/ofccl/run_ofccl.sh b/test_scripts/ofccl/run_ofccl.sh index 1f6c486..ed99b51 100755 --- a/test_scripts/ofccl/run_ofccl.sh +++ b/test_scripts/ofccl/run_ofccl.sh @@ -1,12 +1,12 @@ export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -export NCCL_MAX_NCHANNELS=1 -export NCCL_MIN_NCHANNELS=1 +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -export DATE=221224 -export NCCL_ORDER=1 +export DATE=221225 +export NCCL_ORDER=2 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 From 386ee920d7de62249adfcc49b08c3331f57b1965 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 25 Dec 2022 11:27:53 +0000 Subject: [PATCH 084/109] scripts --- .gitignore | 2 +- nccl_test.sh | 2 +- test_scripts/nccl/run_nccl.sh | 4 ++-- test_scripts/ofccl/run_ofccl.sh | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 0eba5f0..99f99d6 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,6 @@ .vscode -*_result*/ +*result*/ *.xls *.out \ No newline at end of file diff --git a/nccl_test.sh b/nccl_test.sh index 4ce69c7..b938806 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -22,7 +22,7 @@ if [ "$BINARY" == "DEBUG" ];then fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=4K + export NBYTES=64M export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh index c8d5ec9..7a0de3c 100755 --- a/test_scripts/nccl/run_nccl.sh +++ b/test_scripts/nccl/run_nccl.sh @@ -5,7 +5,7 @@ export NCCL_ALGO=Ring # export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -export DATE=221224 +export DATE=221225 export NCCL_ORDER=1 for MY_NUM_DEV in 2 4 8 @@ -14,7 +14,7 @@ do if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export RES_DIR=run_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards + 
export RES_DIR=result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards if [ ! -d "$RES_DIR" ]; then mkdir $RES_DIR fi diff --git a/test_scripts/ofccl/run_ofccl.sh b/test_scripts/ofccl/run_ofccl.sh index ed99b51..3be6bf7 100755 --- a/test_scripts/ofccl/run_ofccl.sh +++ b/test_scripts/ofccl/run_ofccl.sh @@ -6,7 +6,7 @@ export NCCL_ALGO=Ring # export NCCL_NTHREADS=64 export DATE=221225 -export NCCL_ORDER=2 +export NCCL_ORDER=3 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 @@ -22,7 +22,7 @@ do if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export RES_DIR=run_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards + export RES_DIR=result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards if [ ! -d "$RES_DIR" ]; then mkdir $RES_DIR fi From 13567f2f491f9afca09ee4602ddf3f8b60bf6ef9 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 25 Dec 2022 12:29:03 +0000 Subject: [PATCH 085/109] nccl show each kernel time --- nccl_test.sh | 2 +- src/common.cu | 76 +++++++++++++++++++++-------------- src/common.h | 2 + test_scripts/nccl/run_nccl.sh | 2 +- 4 files changed, 50 insertions(+), 32 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index b938806..80a203f 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -22,7 +22,7 @@ if [ "$BINARY" == "DEBUG" ];then fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64M + export NBYTES=64 export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/src/common.cu b/src/common.cu index f22be54..fea29f0 100644 --- a/src/common.cu +++ b/src/common.cu @@ -735,11 +735,13 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } else { sprintf(timeStr, "%7.2f", timeUsec); } - if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); - } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); - } + #ifndef NCCL_DEBUG_CLOCK + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + #endif args->bw[0] += busBw; args->bw_count[0]++; @@ -778,7 +780,10 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + + #ifndef NCCL_DEBUG_CLOCK + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + #endif TESTCHECK(BenchTime(args, type, op, root, 0)); // TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); @@ -1030,13 +1035,16 @@ testResult_t run() { #endif is_main_thread = (proc == 0) ? 
1 : 0; - PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, - (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); - if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); - if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); - PRINT("#\n"); + #ifndef NCCL_DEBUG_CLOCK + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + #endif - PRINT("# Using devices\n"); #define MAX_LINE 2048 char line[MAX_LINE]; int len = 0; @@ -1051,20 +1059,21 @@ testResult_t run() { maxMem = std::min(maxMem, prop.totalGlobalMem); } -#if MPI_SUPPORT - char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL; - // Gather all output in rank order to root (0) - MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD); - if (proc == 0) { - for (int p = 0; p < nProcs; p++) - PRINT("%s", lines+MAX_LINE*p); - free(lines); - } - MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); -#else - PRINT("%s", line); +#ifndef NCCL_DEBUG_CLOCK + #if MPI_SUPPORT + char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines+MAX_LINE*p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); + #else + PRINT("%s", line); + #endif #endif - // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest. size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2); if (maxBytes > memMaxBytes) { @@ -1121,8 +1130,10 @@ testResult_t run() { errors[t] = bw_count[t] = 0; } - PRINT("#\n"); - print_header(); + #ifndef NCCL_DEBUG_CLOCK + PRINT("#\n"); + print_header(); + #endif int* sync = (int*)calloc(2, sizeof(int)); int* barrier = (int*)calloc(2, sizeof(int)); @@ -1202,9 +1213,14 @@ testResult_t run() { double check_avg_bw = str ? atof(str) : -1; bw[0] /= bw_count[0]; - PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK"); - PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK")); - PRINT("#\n"); + #ifndef NCCL_DEBUG_CLOCK + PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK")); + PRINT("#\n"); + #else + PRINT("\n"); + PRINT("\n"); + #endif #ifdef MPI_SUPPORT MPI_Finalize(); #endif diff --git a/src/common.h b/src/common.h index 745bd76..a6703b2 100644 --- a/src/common.h +++ b/src/common.h @@ -18,6 +18,8 @@ #define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +// #define NCCL_DEBUG_CLOCK 1 + #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh index 7a0de3c..d69bf7d 100755 --- a/test_scripts/nccl/run_nccl.sh +++ b/test_scripts/nccl/run_nccl.sh @@ -6,7 +6,7 @@ export NCCL_ALGO=Ring # export NCCL_NTHREADS=64 export DATE=221225 -export NCCL_ORDER=1 +export NCCL_ORDER=4 for MY_NUM_DEV in 2 4 8 do From 12969b5b09eb9417b5b6a1905b95446913775193 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Tue, 27 Dec 2022 09:18:53 +0000 Subject: [PATCH 086/109] =?UTF-8?q?=E8=83=BD=E5=A4=84=E7=90=86=E5=9D=87?= =?UTF-8?q?=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 161 +++++++++++++-------- test_scripts/ofccl/static_ofccl_QE.cpp | 174 +++++++++++++++++++++++ test_scripts/ofccl/static_ofccl_bw.cpp | 43 ++++++ test_scripts/ofccl/static_ofccl_time.cpp | 40 ++++++ 4 files changed, 360 insertions(+), 58 deletions(-) create mode 100644 test_scripts/ofccl/static_ofccl_QE.cpp create mode 100644 test_scripts/ofccl/static_ofccl_bw.cpp create mode 100644 test_scripts/ofccl/static_ofccl_time.cpp diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index a99abab..0c4d9c7 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -1,6 +1,11 @@ import os import xlrd import xlwt +# 设置字体大小 +style = xlwt.XFStyle() +font = xlwt.Font() +font.height = 20*16 +style.font = font # 设置环境变量 os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" @@ -11,20 +16,23 @@ os.environ['BASE_CTX_SWITCH_THRESHOLD'] = "80" os.environ['BOUNS_SWITCH_4_PROCESSED_COLL'] = "0" os.environ['DEV_TRY_ROUND'] = "10" -# test -# f = os.popen("./nccl/run.sh") -# print(f.readlines()) + # 设置超参数 -# run -DATE="221223" -runNcclTest = True # 运行nccl测试 -collectNcclResult = True # 统计nccl测试结果,写入xls -runOfcclTest = True# 运行ofccl测试 -collectOfcclResult = True # 统计ofccl测试结果,写入xls +DATE="221226" +runNcclTest = False # 运行nccl测试,仅输出原始结果 +staticNccl = False +collectNcclResult = True # 统计nccl测试结果,写入xls + + +runOfcclTest = False# 运行ofccl测试 +staticOfccl = True +staticOfcclExtral = True # 对ofccl的额外输出进行统计 +collectOfcclResult = True# 统计ofccl测试结果,写入xls + NCCL_ORDER="1" host=os.environ.get("HOST") -n = 8 +n = 5 m = 1 #nccl w = 2 M = 1 #ofccl @@ -40,14 +48,19 @@ # static os.system("g++ ./nccl/static_nccl.cpp -o ./nccl/static_nccl.out") os.system("g++ ./nccl/static_time.cpp -o ./nccl/static_time.out") -os.system("g++ ./ofccl/clear_static_ofccl_time.cpp -o ./ofccl/clear_static_ofccl_time.out") -os.system("g++ ./ofccl/clear_static_ofccl.cpp -o ./ofccl/clear_static_ofccl.out") - +os.system("g++ ./ofccl/static_ofccl_time.cpp -o ./ofccl/static_ofccl_time.out") +os.system("g++ ./ofccl/static_ofccl_bw.cpp -o ./ofccl/static_ofccl_bw.out") +os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") table = xlwt.Workbook() bwSheet = table.add_sheet('bw') tmSheet = table.add_sheet('time') +# 列宽 +for i in range(30): + bwSheet.col(i).width = 13 * 256 + tmSheet.col(i).width = 16 * 256 + cnt = 0 for MY_NUM_DEV in ncards: @@ -65,24 +78,24 @@ NCCL_OUTPUT_TIME_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards_time.txt" - if runNcclTest == True: + if staticNccl == True: os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_BW_PATH) os.system("echo $(date 
+%F%n%T)>>"+NCCL_OUTPUT_TIME_PATH) - for iter in [1,2,3]: - NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - + for iter in [1,2,3]: + NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + if runNcclTest: os.system("echo $(date +%F%n%T)>> "+NCCL_RES_PATH) for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RES_PATH) - + if staticNccl: os.system("./nccl/static_nccl.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) os.system("./nccl/static_time.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) if collectNcclResult == True : # bus - bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡') + bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) with open(NCCL_OUTPUT_BW_PATH) as f: content = f.read() @@ -90,39 +103,39 @@ axis_y = ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] for a in range(0,25): - bwSheet.write(2+a+cnt*30,0,axis_y[a]) + bwSheet.write(2+a+cnt*30,0,axis_y[a],style) # for k in [0,1,2]: - bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k)) + bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2]) + bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2],style) - bwSheet.write(1+cnt*30,1+15+k,'nccl-busbw'+str(k)) + bwSheet.write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,1+15+k,bw[i+k*50+25+2]) + bwSheet.write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) # avg - bwSheet.write(1+cnt*30, 4, 'avg-algbw') - bwSheet.write(1+cnt*30, 19, 'avg-busbw') + bwSheet.write(1+cnt*30, 4, 'avg-algbw',style) + bwSheet.write(1+cnt*30, 15, 'avg-busbw',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ) - bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3')) + bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) + bwSheet.write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) # time with open(NCCL_OUTPUT_TIME_PATH) as f2: content2 = f2.read() times = content2.split() - tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡') + tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) for a in range(0,25): - tmSheet.write(2+a+cnt*30,0,axis_y[a]) + tmSheet.write(2+a+cnt*30,0,axis_y[a],style) for k in [0,1,2]: - tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k)) + tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k),style) for i in range(0,25): - tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2]) + tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2],style) # avg - tmSheet.write(1+cnt*30, 4, 'avg-nccl') + tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ) + tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ,style) #OFCCL @@ -133,20 +146,26 
@@ # 统计结果 OFCCL_OUTPUT_BW_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards.txt" OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" + OFCCL_OUTPUT_QE_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE.txt" - if runOfcclTest == True: + if staticOfccl == True: os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_TIME_PATH) + if staticOfcclExtral: + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_PATH) - for iter in [1,2,3]: - OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" - + for iter in [1,2,3]: + OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" + if runOfcclTest: os.system("echo $(date +%F%n%T)>> "+OFCCL_RES_PATH) for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: os.system("../build/ofccl_all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ OFCCL_RES_PATH) + if staticOfccl: + os.system("./ofccl/static_ofccl_bw.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH) + os.system("./ofccl/static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH) + if staticOfcclExtral: + os.system("./ofccl/static_ofccl_QE.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_PATH) - os.system("./ofccl/clear_static_ofccl.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./ofccl/clear_static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) if collectOfcclResult == True: @@ -155,19 +174,19 @@ bw = content2.split() #bus for k in [0,1,2]: - bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k)) + bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2]) + bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2],style) - bwSheet.write(1+cnt*30,5+15+k,'ofccl-busbw'+str(k)) + bwSheet.write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,5+15+k,bw[i+k*50+25+2]) + bwSheet.write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) # avg - bwSheet.write(1+cnt*30, 4+4, 'avg-algbw') - bwSheet.write(1+cnt*30, 19+4, 'avg-busbw') + bwSheet.write(1+cnt*30,8, 'avg-algbw',style) + bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ) - bwSheet.write(2+i+cnt*30, 19+4, xlwt.Formula('SUM(U'+str(2+i+cnt*30+1)+',V'+str(2+i+cnt*30+1)+',W'+str(2+i+cnt*30+1)+')/3')) + bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ,style) + bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) # time with open(OFCCL_OUTPUT_TIME_PATH) as f2: @@ -175,22 +194,48 @@ times = content2.split() for k in [0,1,2]: - tmSheet.write(1+cnt*30,5+k,'OFccl-'+str(k)) + tmSheet.write(1+cnt*30,5+k,'ofccl-'+str(k),style) for i in range(0,25): - tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2]) + tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2],style) # avg - tmSheet.write(1+cnt*30, 4+4, 'avg-OFCCL') + tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4+4, 
xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ) + tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ,style) if collectNcclResult and collectOfcclResult: - bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl') - bwSheet.write(1+cnt*30, 24, '(ofccl-nccl)/nccl') - tmSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl') + bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) + bwSheet.write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) + tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) + tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) + for i in range(0,25): + bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)) ,style) + bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) + tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) + tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) + + # time 各个列的标题 + if staticOfcclExtral: + tmSheet.write(1+cnt*30, 13,'nccl IO',style ) + tmSheet.write(1+cnt*30, 14,'nccl kern',style ) + tmSheet.write(1+cnt*30, 15,'ofccl-nccl kern',style ) + tmSheet.write(1+cnt*30, 16,'before after get sqe',style ) + tmSheet.write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) + tmSheet.write(1+cnt*30, 18,'before after put cqe',style ) + tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) + tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) + tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) + + with open(OFCCL_OUTPUT_QE_PATH) as f3: + content3 = f3.read() + times = content3.split() for i in range(0,25): - bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)) ) - bwSheet.write(2+i+cnt*30, 24, xlwt.Formula('(X'+str(2+i+cnt*30+1)+'-T'+str(2+i+cnt*30+1)+')/T'+str(2+i+cnt*30+1) )) - tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ) ) + tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) + tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) + tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) + tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) + tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) + + cnt = cnt+1 diff --git a/test_scripts/ofccl/static_ofccl_QE.cpp b/test_scripts/ofccl/static_ofccl_QE.cpp new file mode 100644 index 0000000..3705bdb --- /dev/null +++ b/test_scripts/ofccl/static_ofccl_QE.cpp @@ -0,0 +1,174 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector time; + vector sqe; + vector beforeCqe; + vector putCqe; + vector afterCqe; + string bw="bandwidth"; + + int cnt = 0; + double sqe_sum = 0; + int sqe_cnt = 0; + + double beforeCqe_sum=0; + int beforeCqe_cnt = 0; + + double putCqe_sum = 0; + int putCqe_cnt = 0; + + double afterCqe_sum = 0; + int afterCqe_cnt = 0; + + while(getline(cin, inputLine)){ + if(inputLine.find(bw,0) != -1){ + // 判断结束一个输出 + // before after get sqe + double sqe_avg = sqe_sum / sqe_cnt; + sqe.push_back(sqe_avg); + sqe_sum = 0; + sqe_cnt =0; + // AfterSqe TO BeforeCqe + double beforeCqe_avg = beforeCqe_sum / beforeCqe_cnt; + beforeCqe.push_back(beforeCqe_avg); + beforeCqe_sum =0; + beforeCqe_cnt =0; + //before after 
put cqe + double putCqe_avg = putCqe_sum / putCqe_cnt; + putCqe.push_back(putCqe_avg); + putCqe_sum = 0; + putCqe_cnt = 0; + //beforeSqe TO afterCqe + double afterCqe_avg = afterCqe_sum/afterCqe_cnt; + afterCqe.push_back(afterCqe_avg); + afterCqe_sum=0; + afterCqe_cnt=0; + + if(++cnt == 25) + break; + } + // rank0 time + int pos = -1; + if ((pos=inputLine.find("time = ",0) ) != -1){ + pos += 7; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + time.push_back(t); + continue; + } + + // before after get sqe + if ((pos=inputLine.find("before after get sqe AVG",0) ) != -1){ + pos += 27; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + stringstream ss; + double tt; + ss << t; + ss >> tt; + pos=inputLine.find("weight = ",0); + pos +=9; + int count = inputLine[pos] - '0'; + sqe_sum += tt * count; + sqe_cnt += count; + continue; + } + //AfterSqe TO BeforeCqe + if ((pos=inputLine.find("AfterSqe TO BeforeCqe AVG",0) ) != -1){ + pos += 28; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + stringstream ss; + double tt; + ss << t; + ss >> tt; + pos=inputLine.find("weight = ",0); + pos +=9; + int count = inputLine[pos] - '0'; + beforeCqe_sum += tt * count; + beforeCqe_cnt += count; + continue; + } + + //before after put cqe + if ((pos=inputLine.find("before after put cqe AVG ",0) ) != -1){ + pos += 27; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + stringstream ss; + double tt; + ss << t; + ss >> tt; + pos=inputLine.find("weight = ",0); + pos +=9; + int count = inputLine[pos] - '0'; + putCqe_sum += tt * count; + putCqe_cnt += count; + continue; + } + //beforeSqe TO afterCqe + if ((pos=inputLine.find("beforeSqe TO afterCqe AVG = ",0) ) != -1){ + pos += 28; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + stringstream ss; + double tt; + ss << t; + ss >> tt; + pos=inputLine.find("weight = ",0); + pos +=9; + int count = inputLine[pos] - '0'; + afterCqe_sum += tt * count; + afterCqe_cnt += count; + continue; + } + + + } + + // before after get sqe + for (auto s:sqe){ + cout << s << endl; + } + cout < +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + b.push_back(ss.top()); + ss.pop(); + a.push_back(ss.top()); + + if(++cnt == 25) + break; + } + + for(auto a1:a) + cout< +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + ss.pop(); + ss.pop(); + a.push_back(ss.top()); + + if(++cnt == 25) + break; + } + + for(auto a1:a) + cout< Date: Tue, 27 Dec 2022 10:47:31 +0000 Subject: [PATCH 087/109] =?UTF-8?q?time=E9=A1=B5=E8=A1=A8=20R=E5=88=97-O?= =?UTF-8?q?=E5=88=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 7 
++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 0c4d9c7..4cb745c 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -18,15 +18,15 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -DATE="221226" +DATE="221227" runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = False +staticNccl = True collectNcclResult = True # 统计nccl测试结果,写入xls runOfcclTest = False# 运行ofccl测试 staticOfccl = True -staticOfcclExtral = True # 对ofccl的额外输出进行统计 +staticOfcclExtral = False # 对ofccl的额外输出进行统计 collectOfcclResult = True# 统计ofccl测试结果,写入xls @@ -229,6 +229,7 @@ content3 = f3.read() times = content3.split() for i in range(0,25): + tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'- O'+str(3+i+cnt*30) ),style ) tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) From 2b4b937c9b674b9a2acd9a0afad0d3176eea3487 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Tue, 27 Dec 2022 11:04:57 +0000 Subject: [PATCH 088/109] =?UTF-8?q?nccl=20kern=20=E6=B1=82=E5=B9=B3?= =?UTF-8?q?=E5=9D=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 4cb745c..57bab69 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -18,16 +18,16 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -DATE="221227" +DATE="221226" runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = True -collectNcclResult = True # 统计nccl测试结果,写入xls +staticNccl = False # 运行统计,输出中间结果 +collectNcclResult = True # 收集nccl测试结果,写入xls runOfcclTest = False# 运行ofccl测试 -staticOfccl = True -staticOfcclExtral = False # 对ofccl的额外输出进行统计 -collectOfcclResult = True# 统计ofccl测试结果,写入xls +staticOfccl = False # 运行统计,输出中间结果 +staticOfcclExtral = True # 对ofccl的额外输出进行统计 +collectOfcclResult = True# 收集ofccl测试结果,写入xls NCCL_ORDER="1" @@ -229,7 +229,8 @@ content3 = f3.read() times = content3.split() for i in range(0,25): - tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'- O'+str(3+i+cnt*30) ),style ) + tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('( V'+str(3+i+cnt*30)+'+W'+str(3+i+cnt*30)+'+X'+str(3+i+cnt*30)+'+Y'+str(3+i+cnt*30)+'+Z'+str(3+i+cnt*30)+' )/5' ),style ) + tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) From 93b9ddcb52e753464643f236a82d8d21aa6a3c91 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Wed, 28 Dec 2022 04:46:34 +0000 Subject: [PATCH 089/109] =?UTF-8?q?=E8=BE=93=E5=87=BA=20ori?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 22 +++- test_scripts/ofccl/static_ofccl_QE_ori.cpp | 120 +++++++++++++++++++++ 2 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 test_scripts/ofccl/static_ofccl_QE_ori.cpp diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 57bab69..964d5ea 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -7,6 +7,7 @@ font.height = 20*16 style.font = font # 设置环境变量 +#os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" os.environ['LD_LIBRARY_PATH'] = 
"/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" os.environ['NCCL_ALGO'] = "RING" @@ -18,7 +19,6 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -DATE="221226" runNcclTest = False # 运行nccl测试,仅输出原始结果 staticNccl = False # 运行统计,输出中间结果 collectNcclResult = True # 收集nccl测试结果,写入xls @@ -29,7 +29,7 @@ staticOfcclExtral = True # 对ofccl的额外输出进行统计 collectOfcclResult = True# 收集ofccl测试结果,写入xls - +DATE="221226" NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 @@ -147,12 +147,14 @@ OFCCL_OUTPUT_BW_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards.txt" OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" OFCCL_OUTPUT_QE_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE.txt" + OFCCL_OUTPUT_QE_ORI_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" if staticOfccl == True: os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_TIME_PATH) if staticOfcclExtral: - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_PATH) + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_PATH) + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_ORI_PATH) for iter in [1,2,3]: OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" @@ -165,6 +167,7 @@ os.system("./ofccl/static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH) if staticOfcclExtral: os.system("./ofccl/static_ofccl_QE.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_PATH) + os.system("./ofccl/static_ofccl_QE_ori.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_ORI_PATH) if collectOfcclResult == True: @@ -224,10 +227,17 @@ tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) + tmSheet.write(1+cnt*30, 27,'before after get sqe ori',style ) + tmSheet.write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) + tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) + tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) with open(OFCCL_OUTPUT_QE_PATH) as f3: content3 = f3.read() times = content3.split() + with open(OFCCL_OUTPUT_QE_ORI_PATH) as f4: + content4 = f4.read() + times4 = content4.split() for i in range(0,25): tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('( V'+str(3+i+cnt*30)+'+W'+str(3+i+cnt*30)+'+X'+str(3+i+cnt*30)+'+Y'+str(3+i+cnt*30)+'+Z'+str(3+i+cnt*30)+' )/5' ),style ) tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) @@ -236,6 +246,12 @@ tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) + for j in range(0,5): + tmSheet.write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) + tmSheet.write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) + tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) + tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) + diff --git a/test_scripts/ofccl/static_ofccl_QE_ori.cpp b/test_scripts/ofccl/static_ofccl_QE_ori.cpp new file mode 100644 index 0000000..08794b5 --- /dev/null +++ b/test_scripts/ofccl/static_ofccl_QE_ori.cpp @@ -0,0 +1,120 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + + vector sqe_ori; + vector beforeCqe_ori; + vector putCqe_ori; + vector 
afterCqe_ori; + string bw="bandwidth"; + + + int cnt=0; + while(getline(cin, inputLine)){ + if(inputLine.find(bw,0) != -1){ + // 判断结束一个输出 + // before after get sqe + + if(++cnt == 25) + break; + } + // rank0 time + int pos = -1; + // before after get sqe + if ((pos=inputLine.find("Rank<0> Blk<0> Thrd<0> coll_id = 0, before after get sqe = ",0) ) != -1){ + pos += 58; + string numbers = inputLine.substr(pos); + stringstream ss ; + ss << numbers; + for(int i = 0;i < 5;i++){ + double tmp; + ss >> tmp; + sqe_ori.push_back(tmp); + } + continue; + } + //AfterSqe TO BeforeCqe + if ((pos=inputLine.find("AfterSqe TO BeforeCqe = ",0) ) != -1){ + pos += 24; + string numbers = inputLine.substr(pos); + stringstream ss ; + ss << numbers; + for(int i = 0;i < 5;i++){ + double tmp; + ss >> tmp; + if(tmp > 0.00001) + beforeCqe_ori.push_back(tmp); + } + continue; + } + + //before after put cqe + if ((pos=inputLine.find("before after put cqe = ",0) ) != -1){ + pos += 23; + string numbers = inputLine.substr(pos); + stringstream ss ; + ss << numbers; + for(int i = 0;i < 5;i++){ + double tmp; + ss >> tmp; + if(tmp > 0.00001) + putCqe_ori.push_back(tmp); + } + continue; + } + + //beforeSqe TO afterCqe + if ((pos=inputLine.find("beforeSqe TO afterCqe = ",0) ) != -1){ + pos += 24; + string numbers = inputLine.substr(pos); + stringstream ss ; + ss << numbers; + for(int i = 0;i < 5;i++){ + double tmp; + ss >> tmp; + if(tmp > 0.00001) + afterCqe_ori.push_back(tmp); + } + continue; + } + } + + // before after get sqe + for(int i = 0;i <25;i++){ + for(int j =0;j < 5;j++) + cout< Date: Wed, 28 Dec 2022 10:06:41 +0000 Subject: [PATCH 090/109] =?UTF-8?q?=E7=BC=96=E8=AF=91=20QE=5Foricpp=20?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=80=E5=88=97=E5=AE=9E=E9=99=85=E7=9A=84?= =?UTF-8?q?byte=E6=95=B0=EF=BC=8C=E5=A2=9E=E5=8A=A0=20Ex-Ox=EF=BC=8C?= =?UTF-8?q?=E4=BF=AE=E6=94=B9average?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 964d5ea..4f8a1e9 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -51,6 +51,7 @@ os.system("g++ ./ofccl/static_ofccl_time.cpp -o ./ofccl/static_ofccl_time.out") os.system("g++ ./ofccl/static_ofccl_bw.cpp -o ./ofccl/static_ofccl_bw.out") os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") +os.system("g++ ./ofccl/static_ofccl_QE_ori.cpp -o ./ofccl/static_ofccl_QE_ori.out") table = xlwt.Workbook() @@ -232,6 +233,11 @@ tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) + y = 64 + for i in range(0,25): + tmSheet.write(2+i+cnt*30,12,y,style) + y = y*2 + with open(OFCCL_OUTPUT_QE_PATH) as f3: content3 = f3.read() times = content3.split() @@ -239,7 +245,8 @@ content4 = f4.read() times4 = content4.split() for i in range(0,25): - tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('( V'+str(3+i+cnt*30)+'+W'+str(3+i+cnt*30)+'+X'+str(3+i+cnt*30)+'+Y'+str(3+i+cnt*30)+'+Z'+str(3+i+cnt*30)+' )/5' ),style ) + tmSheet.write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) 
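Note on the spreadsheet formulas used throughout auto_test.py in the patches above: the script writes the raw per-run measurements into fixed columns and then derives the averages and the ofccl-nccl deltas as Excel formulas that reference those cells by column letter and 1-based row number, which is why the formula strings use str(2+i+cnt*30+1) while the write() call uses the 0-based row 2+i+cnt*30. The following is only a minimal sketch of that xlwt pattern under assumed placeholder values; the file name demo.xls and the sample numbers are illustrative and not part of any patch.

import xlwt

# Same style setup as auto_test.py: font.height is in twips (1/20 pt), so 20*16 = 16 pt.
style = xlwt.XFStyle()
font = xlwt.Font()
font.height = 20 * 16
style.font = font

book = xlwt.Workbook()
sheet = book.add_sheet('time')

# Raw per-run values go into columns B, C, D of spreadsheet row 2 (0-based row index 1).
for col, value in enumerate([10.5, 11.0, 10.8], start=1):
    sheet.write(1, col, value, style)

# Derived cells are Excel formulas. xlwt.Formula takes the text without the leading '=',
# and cell references inside it are 1-based, so 0-based row index 1 appears as "2".
sheet.write(1, 4, xlwt.Formula('SUM(B2,C2,D2)/3'), style)   # three-run average, as in the bw/time sheets
sheet.write(1, 5, xlwt.Formula('AVERAGEA(B2:D2)'), style)   # range form like the one patch 090 switches to

book.save('demo.xls')

The per-card blocks in the real script follow the same idea, only shifted by cnt*30 rows per block, so every formula string is built from that offset plus one.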
From d3f652da48c3240b1c282c91ea333e722e34828e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 29 Dec 2022 08:07:52 +0000 Subject: [PATCH 091/109] + in order ms --- ofccl_test.sh | 6 ++--- src_manual_size/common_ms.cu | 49 +++++++++++++++++++++++++---------- src_manual_size/common_ms.h | 2 ++ test_scripts/auto_test.py | 15 ++++++----- test_scripts/nccl/run_nccl.sh | 4 +-- 5 files changed, 50 insertions(+), 26 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 33982e7..68b8e8d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,13 +42,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64M + export NBYTES=64 export WARMITER=2 export MITER=1 export CHECK=0 @@ -70,7 +70,7 @@ elif [ "$BINARY" == "MS" ];then if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export NITER=200 + export NITER=4 export SHOW_ALL_PREPARED_COLL=1 export WARMITER=0 export NBYTES=8K diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 0ed1041..2b8146c 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -17,20 +17,41 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() #ifdef FULL_MS size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; - int idxList[8][MULTI_ITERS] = { - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 
157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 
40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - } - }; + #ifndef IN_ORDER + int idxList[8][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 
42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + } + }; + #else + int idxList[8][MULTI_ITERS] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 
102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 
111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + } + }; + #endif #else // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512}; // size_t idxList[8][MULTI_ITERS] = { diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index 1785efe..14f0ffb 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -28,6 +28,8 @@ #define MULTI_ITERS 2 #endif +// #define IN_ORDER 1 + #define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) #define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 964d5ea..9a6b99f 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -7,8 +7,8 @@ font.height = 20*16 style.font = font # 设置环境变量 -#os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" -os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" +os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" +# os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" os.environ['NCCL_ALGO'] = "RING" @@ -19,17 +19,17 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = False # 运行统计,输出中间结果 +runNcclTest = True # 运行nccl测试,仅输出原始结果 +staticNccl = True # 运行统计,输出中间结果 collectNcclResult = True # 收集nccl测试结果,写入xls -runOfcclTest = False# 运行ofccl测试 -staticOfccl = False # 运行统计,输出中间结果 +runOfcclTest = True# 运行ofccl测试 +staticOfccl = True # 运行统计,输出中间结果 staticOfcclExtral = True # 对ofccl的额外输出进行统计 collectOfcclResult = True# 收集ofccl测试结果,写入xls -DATE="221226" +DATE="221229" NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 @@ -51,6 +51,7 @@ os.system("g++ ./ofccl/static_ofccl_time.cpp -o ./ofccl/static_ofccl_time.out") os.system("g++ ./ofccl/static_ofccl_bw.cpp -o ./ofccl/static_ofccl_bw.out") os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") +os.system("g++ ./ofccl/static_ofccl_QE_ori.cpp -o ./ofccl/static_ofccl_QE_ori.out") table = xlwt.Workbook() diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh index d69bf7d..890e045 100755 --- a/test_scripts/nccl/run_nccl.sh +++ b/test_scripts/nccl/run_nccl.sh @@ -5,8 +5,8 @@ export NCCL_ALGO=Ring # export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -export DATE=221225 -export NCCL_ORDER=4 +export DATE=221228 +export NCCL_ORDER=1 for MY_NUM_DEV in 2 4 8 do From 5135aa3cbaeb04eccf3b75b6e5b33a6bf3c8bc61 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Thu, 29 Dec 2022 08:15:28 +0000 Subject: [PATCH 092/109] =?UTF-8?q?=E8=BE=93=E5=87=BA=20totalCnt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 55 +++++++- test_scripts/ofccl/static_ofccl_totalCnt.cpp | 124 +++++++++++++++++++ 2 files changed, 176 insertions(+), 3 deletions(-) create mode 100644 test_scripts/ofccl/static_ofccl_totalCnt.cpp diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 4f8a1e9..22bd802 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -20,12 +20,12 @@ # 设置超参数 runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = False # 运行统计,输出中间结果 -collectNcclResult = True # 收集nccl测试结果,写入xls +staticNccl = True # 运行统计,输出中间结果 +collectNcclResult =True # 收集nccl测试结果,写入xls runOfcclTest = False# 运行ofccl测试 -staticOfccl = False # 运行统计,输出中间结果 +staticOfccl = True # 运行统计,输出中间结果 staticOfcclExtral = True # 对ofccl的额外输出进行统计 collectOfcclResult = True# 收集ofccl测试结果,写入xls @@ -52,11 +52,13 @@ os.system("g++ ./ofccl/static_ofccl_bw.cpp -o ./ofccl/static_ofccl_bw.out") os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") os.system("g++ ./ofccl/static_ofccl_QE_ori.cpp -o ./ofccl/static_ofccl_QE_ori.out") +os.system("g++ 
./ofccl/static_ofccl_totalCnt.cpp -o ./ofccl/static_ofccl_totalCnt.out") table = xlwt.Workbook() bwSheet = table.add_sheet('bw') tmSheet = table.add_sheet('time') +cntSheet = table.add_sheet('totalCnt') # 列宽 for i in range(30): bwSheet.col(i).width = 13 * 256 @@ -149,6 +151,7 @@ OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" OFCCL_OUTPUT_QE_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE.txt" OFCCL_OUTPUT_QE_ORI_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + OFCCL_OUTPUT_TOTALCNT_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" if staticOfccl == True: os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) @@ -169,6 +172,7 @@ if staticOfcclExtral: os.system("./ofccl/static_ofccl_QE.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_PATH) os.system("./ofccl/static_ofccl_QE_ori.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_ORI_PATH) + os.system("./ofccl/static_ofccl_totalCnt.out "+OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TOTALCNT_PATH) if collectOfcclResult == True: @@ -259,6 +263,51 @@ tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) + # cntsheet + cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + axis_y = ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + for a in range(0,25): + cntSheet.write(2+a+cnt*30,0,axis_y[a],style) + + cntSheet.write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) + cntSheet.write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) + cntSheet.write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) + cntSheet.write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) + cntSheet.write(1+cnt*30,6,"totalCtxSaveCnt",style) + cntSheet.write(1+cnt*30,24,"totalCtxLoadCnt",style) + cntSheet.write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) + cntSheet.write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) + + with open(OFCCL_OUTPUT_TOTALCNT_PATH) as f: + line = f.readline() + # save + for i in range(0,25): + numbers = line.split() + cntSheet.write(i+2+cnt*30,1,numbers[0]) + for j in range(1,len(numbers)): + cntSheet.write(i+2+cnt*30,5+j,numbers[j]) + line = f.readline() + # load + for i in range(0,25): + numbers = line.split() + cntSheet.write(i+2+cnt*30,2,numbers[0]) + for j in range(1,len(numbers)): + cntSheet.write(i+2+cnt*30,23+j,numbers[j]) + line = f.readline() + # totalProgressed7SwithchCnt + for i in range(0,25): + numbers = line.split() + cntSheet.write(i+2+cnt*30,3,numbers[0]) + for j in range(1,len(numbers)): + cntSheet.write(i+2+cnt*30,41+j,numbers[j]) + line = f.readline() + # totalUnprogressedQuitCnt + for i in range(0,25): + numbers = line.split() + cntSheet.write(i+2+cnt*30,4,numbers[0]) + for j in range(1,len(numbers)): + cntSheet.write(i+2+cnt*30,59+j,numbers[j]) + line = f.readline() diff --git a/test_scripts/ofccl/static_ofccl_totalCnt.cpp b/test_scripts/ofccl/static_ofccl_totalCnt.cpp new file mode 100644 index 0000000..c1f78ee --- /dev/null +++ b/test_scripts/ofccl/static_ofccl_totalCnt.cpp @@ -0,0 +1,124 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector> save_ori(25,vector()); + vector> load_ori(25,vector()); + vector> p7s_ori(25,vector()); + vector> quit_ori(25,vector()); + + vector save_avg; + vector load_avg; + 
vector p7s_avg; + vector quit_avg; + + string bw="bandwidth"; + + int cnt=0; + while(getline(cin, inputLine)){ + if(inputLine.find(bw,0) != -1){ + // 判断结束一个输出 + // save + double sum = accumulate(begin(save_ori[cnt]), end(save_ori[cnt]), 0); + double mean = sum / save_ori[cnt].size(); + save_avg.push_back(mean); + // load + sum = accumulate(begin(load_ori[cnt]), end(load_ori[cnt]),0); + mean = sum / load_ori[cnt].size(); + load_avg.push_back(mean); + // p7s + sum = accumulate(begin(p7s_ori[cnt]), end(p7s_ori[cnt]),0); + mean = sum / p7s_ori[cnt].size(); + p7s_avg.push_back(mean); + // quit + sum = accumulate(begin(quit_ori[cnt]), end(quit_ori[cnt]),0); + mean = sum / quit_ori[cnt].size(); + quit_avg.push_back(mean); + + if(++cnt == 25) + break; + } + + int pos = 0; + // save + while((pos=inputLine.find("totalCtxSaveCnt=",pos) ) != -1){ + pos += 16; + int number = 0; + while(inputLine[pos]>='0' &&inputLine[pos]<='9'){ + number = number*10 + (inputLine[pos]-'0'); + pos++; + } + save_ori[cnt].push_back(number); + } + pos=0; + while((pos=inputLine.find("totalCtxLoadCnt=",pos) ) != -1){ + pos += 16; + int number = 0; + while(inputLine[pos]>='0' &&inputLine[pos]<='9'){ + number = number*10 + (inputLine[pos]-'0'); + pos++; + } + load_ori[cnt].push_back(number); + } + + pos=0; + while((pos=inputLine.find("totalProgressed7SwithchCnt=",pos) ) != -1){ + pos += 27; + int number = 0; + while(inputLine[pos]>='0' &&inputLine[pos]<='9'){ + number = number*10 + (inputLine[pos]-'0'); + pos++; + } + p7s_ori[cnt].push_back(number); + } + + pos=0; + while((pos=inputLine.find("totalUnprogressedQuitCnt=",pos) ) != -1){ + pos += 25; + int number = 0; + while(inputLine[pos]>='0' &&inputLine[pos]<='9'){ + number = number*10 + (inputLine[pos]-'0'); + pos++; + } + quit_ori[cnt].push_back(number); + } + + + } + + + for(int i = 0;i < 25;i++){ + cout << save_avg[i]<<" "; + for(auto num:save_ori[i]) + cout< Date: Thu, 29 Dec 2022 13:50:58 +0000 Subject: [PATCH 093/109] script --- ofccl_test.sh | 2 +- test_scripts/auto_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 68b8e8d..841850d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,7 +42,7 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 9a6b99f..469396a 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -30,7 +30,7 @@ collectOfcclResult = True# 收集ofccl测试结果,写入xls DATE="221229" -NCCL_ORDER="1" +NCCL_ORDER="3" host=os.environ.get("HOST") n = 5 m = 1 #nccl From 57ee21fc8cf72e854a81d8795dd2904de39ff4c8 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 30 Dec 2022 11:04:02 +0000 Subject: [PATCH 094/109] scripts --- ofccl_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 841850d..68b8e8d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,7 +42,7 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi From 9ee720221ccd288eaba985e662ff1efb5241e613 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 1 Jan 2023 11:19:41 +0000 Subject: [PATCH 095/109] scripts --- test_scripts/auto_test.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git 
a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 5cc47ee..de3e4ea 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -29,7 +29,11 @@ staticOfcclExtral = True # 对ofccl的额外输出进行统计 collectOfcclResult = True# 收集ofccl测试结果,写入xls -DATE="221229" +buffer_sizes = ["64", "128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + +TINY_TEST = 0 + +DATE="230101" NCCL_ORDER="3" host=os.environ.get("HOST") n = 5 @@ -43,6 +47,13 @@ NUM_DEV = 8 ncards = [2,4,8] +if TINY_TEST == 1: + runNcclTest = False # 运行nccl测试,仅输出原始结果 + staticNccl = False # 运行统计,输出中间结果 + collectNcclResult = False # 收集nccl测试结果,写入xls + ncards = [2] + # buffer_sizes = ["64", "128", "256", "512", "1K"] + resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+str(m)+"n"+str(n)+"w"+str(w)+".xls" # static @@ -90,7 +101,7 @@ NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" if runNcclTest: os.system("echo $(date +%F%n%T)>> "+NCCL_RES_PATH) - for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: + for a in buffer_sizes: os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RES_PATH) if staticNccl: os.system("./nccl/static_nccl.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) @@ -104,7 +115,7 @@ content = f.read() bw = content.split() - axis_y = ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + axis_y = buffer_sizes for a in range(0,25): bwSheet.write(2+a+cnt*30,0,axis_y[a],style) # @@ -138,7 +149,7 @@ # avg tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ,style) + tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) #OFCCL @@ -164,7 +175,7 @@ OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" if runOfcclTest: os.system("echo $(date +%F%n%T)>> "+OFCCL_RES_PATH) - for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: + for a in buffer_sizes: os.system("../build/ofccl_all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ OFCCL_RES_PATH) if staticOfccl: os.system("./ofccl/static_ofccl_bw.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH) @@ -193,7 +204,7 @@ bwSheet.write(1+cnt*30,8, 'avg-algbw',style) bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ,style) + bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) # time @@ -208,7 +219,7 @@ # avg tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) for i in 
range(0,25): - tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ,style) + tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) if collectNcclResult and collectOfcclResult: bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) @@ -216,7 +227,7 @@ tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)) ,style) + bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) @@ -265,7 +276,7 @@ # cntsheet cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - axis_y = ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + axis_y = buffer_sizes for a in range(0,25): cntSheet.write(2+a+cnt*30,0,axis_y[a],style) From 665de439cff3cdae5dc08f3c154d631355b6c23f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 6 Jan 2023 03:28:59 +0000 Subject: [PATCH 096/109] scripts --- ofccl_test.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 68b8e8d..b48a426 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -46,6 +46,9 @@ if [ "$BINARY" == "DEBUG" ];then if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi + if [ $MY_NUM_DEV = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 export NBYTES=64 @@ -66,11 +69,11 @@ elif [ "$BINARY" == "PERF" ];then export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=4 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export NITER=4 + export NITER=200 export SHOW_ALL_PREPARED_COLL=1 export WARMITER=0 export NBYTES=8K From b5a42cca4cf37ec6f25d9d111c19109a3a4c9bc4 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 6 Jan 2023 03:57:24 +0000 Subject: [PATCH 097/109] scripts --- test_scripts/auto_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index de3e4ea..1f89fe8 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -33,8 +33,8 @@ TINY_TEST = 0 -DATE="230101" -NCCL_ORDER="3" +DATE="230106" +NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 m = 1 #nccl From 197018788d9228921e4f2bbc0e3a277b981477b3 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 8 Jan 2023 14:49:22 +0000 Subject: [PATCH 098/109] scripts --- ofccl_test.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index b48a426..dfefb3a 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -1,7 +1,7 @@ clear -export DEBUG_CC=0 -export DEBUG_ENQ=0 +export DEBUG_CC=1 +export DEBUG_ENQ=1 cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib @@ -95,7 +95,7 @@ if [ "$RUN_TYPE" == 
"PURE" ];then cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" - # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 + # set args -b 64 -e 64 -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 elif [ "$RUN_TYPE" == "NSYS" ];then cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "NCU" ];then From ceb3a5aaf8d5de8138bff022c73d5bd62a823f02 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 10 Jan 2023 09:57:36 +0000 Subject: [PATCH 099/109] scripts --- ofccl_test.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index dfefb3a..dcd4868 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -18,6 +18,10 @@ export TOLERANT_UNPROGRESSED_CNT=10000 export BASE_CTX_SWITCH_THRESHOLD=80 export BOUNS_SWITCH_4_PROCESSED_COLL=0 export DEV_TRY_ROUND=10 +export DEBUG_FILE="/home/panlichen/work2/ofccl/log/oneflow_cpu_rank_" + +rm -rf /home/panlichen/work2/ofccl/log +mkdir -p /home/panlichen/work2/ofccl/log # export ENABLE_VQ=1 # volunteer quit # export TOLERANT_FAIL_CHECK_SQ_CNT=5000 @@ -28,6 +32,7 @@ echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo DEBUG_FILE=$DEBUG_FILE if [ ! -z $ENABLE_VQ ];then echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT @@ -51,7 +56,7 @@ if [ "$BINARY" == "DEBUG" ];then fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64 + export NBYTES=2 export WARMITER=2 export MITER=1 export CHECK=0 From ac30fd49fa15a9131b03ea6ad7d3f2d19fcd8659 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 11 Jan 2023 09:55:49 +0000 Subject: [PATCH 100/109] datatype in cmd; MY_NUM_DEV as cmd line param --- ofccl_test.sh | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index dcd4868..9302e61 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -1,5 +1,7 @@ clear +export MY_NUM_DEV=$1 + export DEBUG_CC=1 export DEBUG_ENQ=1 @@ -47,7 +49,6 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi @@ -62,7 +63,6 @@ if [ "$BINARY" == "DEBUG" ];then export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi @@ -74,7 +74,6 @@ elif [ "$BINARY" == "PERF" ];then export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" - export MY_NUM_DEV=4 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi @@ -96,8 +95,42 @@ if [ -z $RUN_TYPE ];then # RUN_TYPE="NCU" fi +# typedef enum { ncclInt8 = 0, ncclChar = 0, +# ncclUint8 = 1, +# ncclInt32 = 2, ncclInt = 2, +# ncclUint32 = 3, +# ncclInt64 = 4, +# ncclUint64 = 5, +# ncclFloat16 = 6, ncclHalf = 6, +# ncclFloat32 = 7, ncclFloat = 7, +# ncclFloat64 = 8, ncclDouble = 8, +# #if defined(__CUDA_BF16_TYPES_EXIST__) +# ncclBfloat16 = 9, +# ncclNumTypes = 10 +# #else +# ncclNumTypes = 9 +# #endif +# } ncclDataType_t; + +# 用这个: +# const char *test_typenames[ncclNumTypes] = {"int8", +# "uint8", +# "int32", +# 
"uint32", +# "int64", +# "uint64", +# "half", +# "float", +# "double" +# #if defined(__CUDA_BF16_TYPES_EXIST__) && \ +# NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +# , +# "bfloat16" +# #endif +# }; + if [ "$RUN_TYPE" == "PURE" ];then - cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="$target -d half -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" # set args -b 64 -e 64 -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 From 82c2a8e19b957cd30e4e2d08b27bf2f0ab174fee Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 13 Jan 2023 09:18:47 +0000 Subject: [PATCH 101/109] scripts --- ofccl_test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 9302e61..f080213 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -20,6 +20,7 @@ export TOLERANT_UNPROGRESSED_CNT=10000 export BASE_CTX_SWITCH_THRESHOLD=80 export BOUNS_SWITCH_4_PROCESSED_COLL=0 export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 export DEBUG_FILE="/home/panlichen/work2/ofccl/log/oneflow_cpu_rank_" rm -rf /home/panlichen/work2/ofccl/log @@ -34,6 +35,7 @@ echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL echo DEBUG_FILE=$DEBUG_FILE if [ ! -z $ENABLE_VQ ];then @@ -57,7 +59,7 @@ if [ "$BINARY" == "DEBUG" ];then fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=2 + export NBYTES=64 export WARMITER=2 export MITER=1 export CHECK=0 From c760c82589e32f259af2853f498847dfe0218b9f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 14 Jan 2023 14:21:15 +0000 Subject: [PATCH 102/109] + occl AllGather --- nccl_test.sh | 27 ++++-- ofccl_test.sh | 34 +++++--- src_simple/Makefile | 3 +- src_simple/common_simple.h | 1 + src_simple/ofccl_all_gather.cu | 151 +++++++++++++++++++++++++++++++++ 5 files changed, 194 insertions(+), 22 deletions(-) create mode 100644 src_simple/ofccl_all_gather.cu diff --git a/nccl_test.sh b/nccl_test.sh index 80a203f..de799b2 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -1,5 +1,7 @@ clear +export MY_NUM_DEV=$1 + cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple @@ -14,33 +16,40 @@ if [ -z $BINARY ];then # BINARY="PERF" fi -if [ "$BINARY" == "DEBUG" ];then +FUNC=$2 + +if [ "$FUNC" == "AR" ]; then target="./build/all_reduce_perf" - export MY_NUM_DEV=2 +elif [ "$FUNC" == "AG" ]; then + target="./build/all_gather_perf" +elif [ "$FUNC" == "RS" ]; then + target="./build/reduce_scatter_perf" +elif [ "$FUNC" == "R" ]; then + target="./build/reduce_perf" +elif [ "$FUNC" == "B" ]; then + target="./build/broadcast_perf" +fi + + +if [ "$BINARY" == "DEBUG" ];then if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64 + export NBYTES=1G export WARMITER=2 export MITER=1 export CHECK=0 elif [ "$BINARY" == "PERF" ];then - target="./build/all_reduce_perf" - export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export SHOW_ALL_PREPARED_COLL=0 export NITER=4 export NBYTES=8K export WARMITER=2 export MITER=4 export CHECK=0 elif [ "$BINARY" == "MS" ];then - export MY_NUM_DEV=8 - # 
target="./build/ofccl_all_reduce_ms_perf" if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi diff --git a/ofccl_test.sh b/ofccl_test.sh index f080213..514b756 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -9,11 +9,12 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -# export NCCL_MAX_NCHANNELS=1 -# export NCCL_MIN_NCHANNELS=1 -# export NCCL_NTHREADS=64 +export NCCL_MAX_NCHANNELS=1 +export NCCL_MIN_NCHANNELS=1 +export NCCL_NTHREADS=64 export CHECK=0 +export SHOW_ALL_PREPARED_COLL=0 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 @@ -43,6 +44,23 @@ if [ ! -z $ENABLE_VQ ];then echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT fi +FUNC=$2 +if [ -z $FUNC ]; then + FUNC="AR" +fi + +if [ "$FUNC" == "AR" ]; then + target="./build/ofccl_all_reduce_perf" +elif [ "$FUNC" == "AG" ]; then + target="./build/ofccl_all_gather_perf" +elif [ "$FUNC" == "RS" ]; then + target="./build/ofccl_reduce_scatter_perf" +elif [ "$FUNC" == "R" ]; then + target="./build/ofccl_reduce_perf" +elif [ "$FUNC" == "B" ]; then + target="./build/ofccl_broadcast_perf" +fi + if [ -z $BINARY ];then BINARY="DEBUG" # BINARY="MS" @@ -50,30 +68,24 @@ if [ -z $BINARY ];then fi if [ "$BINARY" == "DEBUG" ];then - target="./build/ofccl_all_reduce_perf" if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi if [ $MY_NUM_DEV = 2 ]; then export CUDA_VISIBLE_DEVICES=4,5 fi - export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64 + export NBYTES=1G export WARMITER=2 export MITER=1 - export CHECK=0 elif [ "$BINARY" == "PERF" ];then - target="./build/ofccl_all_reduce_perf" if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export SHOW_ALL_PREPARED_COLL=0 export NITER=8 export NBYTES=8K export WARMITER=2 export MITER=1 - export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" if [ $MY_NUM_DEV = 4 ]; then @@ -132,7 +144,7 @@ fi # }; if [ "$RUN_TYPE" == "PURE" ];then - cmd="$target -d half -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" # -d half elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" # set args -b 64 -e 64 -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 diff --git a/src_simple/Makefile b/src_simple/Makefile index ccad131..c007331 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -89,8 +89,7 @@ $(info CARDNAME $(NVCUFLAGS)) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -# BIN_FILES_LIST := all_reduce_group all_reduce_simple ofccl_all_reduce -BIN_FILES_LIST := ofccl_all_reduce +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 8801172..9236d77 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -74,6 +74,7 @@ typedef enum { typedef struct { int collId; int gotCqe; + // int cqeCnt; pthread_mutex_t mutex; } CallBackArgs; diff --git a/src_simple/ofccl_all_gather.cu b/src_simple/ofccl_all_gather.cu new file mode 100644 index 0000000..26fd9bb --- /dev/null +++ b/src_simple/ofccl_all_gather.cu @@ -0,0 +1,151 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s out-of-place in-place \n", 
"", "", ""); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s", size, count, typeName); +} + +void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count/nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = count/nranks; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunAllGather(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllGather for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllGather sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllGather(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllGather with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allGatherTest = { + "AllGather", + AllGatherGetCollByteCount, + AllGatherInitData, + AllGatherGetBw, + AllGatherRunColl, + AllGatherPrepare +}; + +void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllGatherGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allGatherTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Sat, 14 Jan 2023 15:08:38 +0000 Subject: [PATCH 103/109] 5555 remove NCCL_MIN_NCHANNELS limit T^T TAT T_T T-T --- ofccl_test.sh | 6 +++--- test_scripts/auto_test.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 514b756..d94f55b 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -9,9 +9,9 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -export NCCL_MAX_NCHANNELS=1 -export NCCL_MIN_NCHANNELS=1 -export NCCL_NTHREADS=64 +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 export CHECK=0 export SHOW_ALL_PREPARED_COLL=0 diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 
1f89fe8..ac5f5f7 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -33,7 +33,7 @@ TINY_TEST = 0 -DATE="230106" +DATE="230114" NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 From bbb04c67cff6a4034e444cc598caa366fd7500ba Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 15 Jan 2023 03:09:53 +0000 Subject: [PATCH 104/109] + ofccl ReduceScatter --- ofccl_test.sh | 2 +- src_simple/Makefile | 2 +- src_simple/ofccl_reduce_scatter.cu | 153 +++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 src_simple/ofccl_reduce_scatter.cu diff --git a/ofccl_test.sh b/ofccl_test.sh index d94f55b..f3270ab 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -75,7 +75,7 @@ if [ "$BINARY" == "DEBUG" ];then export CUDA_VISIBLE_DEVICES=4,5 fi export NITER=5 - export NBYTES=1G + export NBYTES=64M export WARMITER=2 export MITER=1 elif [ "$BINARY" == "PERF" ];then diff --git a/src_simple/Makefile b/src_simple/Makefile index c007331..cc93c28 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -89,7 +89,7 @@ $(info CARDNAME $(NVCUFLAGS)) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/ofccl_reduce_scatter.cu b/src_simple/ofccl_reduce_scatter.cu new file mode 100644 index 0000000..44c3c10 --- /dev/null +++ b/src_simple/ofccl_reduce_scatter.cu @@ -0,0 +1,153 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; + *recvcount = count/nranks; + *sendInplaceOffset = 0; + *recvInplaceOffset = count/nranks; + *paramcount = *recvcount; +} + +testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunReduceScatter(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduceScatter for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduceScatter sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t ReduceScatterPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareReduceScatter(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareReduceScatter with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl reduceScatterTest = { + "ReduceScatter", + ReduceScatterGetCollByteCount, + ReduceScatterInitData, + ReduceScatterGetBw, + ReduceScatterRunColl, + ReduceScatterPrepare +}; + +void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceScatterGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceScatterTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if 
((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Sun, 15 Jan 2023 04:48:00 +0000 Subject: [PATCH 105/109] + occl reduce --- src_simple/Makefile | 2 +- src_simple/common_simple.cu | 2 +- src_simple/common_simple.h | 2 +- src_simple/ofccl_all_gather.cu | 2 +- src_simple/ofccl_all_reduce.cu | 2 +- src_simple/ofccl_reduce.cu | 159 +++++++++++++++++++++++++++++ src_simple/ofccl_reduce_scatter.cu | 2 +- 7 files changed, 165 insertions(+), 6 deletions(-) create mode 100644 src_simple/ofccl_reduce.cu diff --git a/src_simple/Makefile b/src_simple/Makefile index cc93c28..1812202 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -89,7 +89,7 @@ $(info CARDNAME $(NVCUFLAGS)) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter ofccl_reduce BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 52d6be6..fc1d809 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -666,7 +666,7 @@ testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, &op, &u64, type, ncclScalarHostImmediate, comm)); } #endif - TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); + TESTCHECK(args->collTest->prepareColl(count, type, op, root, comm, miter, rankCtx)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 9236d77..daba610 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -90,7 +90,7 @@ struct testColl { ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); - testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; diff --git a/src_simple/ofccl_all_gather.cu b/src_simple/ofccl_all_gather.cu index 26fd9bb..b22aab9 100644 --- a/src_simple/ofccl_all_gather.cu +++ b/src_simple/ofccl_all_gather.cu @@ -86,7 +86,7 @@ testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, int collId, CallBa return testSuccess; } -testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { +testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { NCCLCHECK(ofcclPrepareAllGather(count, datatype, op, comm, collId, rankCtx)); // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllGather with count=%lu, collId=%d", pthread_self(), count, collId); diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 50aaad8..7dd65d9 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -96,7 +96,7 @@ testResult_t 
AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa return testSuccess; } -testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); diff --git a/src_simple/ofccl_reduce.cu b/src_simple/ofccl_reduce.cu new file mode 100644 index 0000000..33db29c --- /dev/null +++ b/src_simple/ofccl_reduce.cu @@ -0,0 +1,159 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root); +} + +void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + *algBw = baseBw; + *busBw = baseBw; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
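+  // The callback itself only flips gotCqe under the mutex; the test's host thread
+  // is expected to poll that flag until the CQE for this collective arrives.
+  // A minimal polling sketch, assuming only the CallBackArgs fields used in this
+  // file (gotCqe, mutex) and a hypothetical helper name waitCqe:
+  //   static void waitCqe(CallBackArgs *cbArgs) {
+  //     int done = 0;
+  //     while (!done) {                          // spin until myCallback() fires
+  //       pthread_mutex_lock(&cbArgs->mutex);
+  //       done = cbArgs->gotCqe;                 // set to 1 below, under the lock
+  //       pthread_mutex_unlock(&cbArgs->mutex);
+  //     }
+  //   }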
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t ReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareReduce(count, datatype, op, root, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl reduceTest = { + "Reduce", + ReduceGetCollByteCount, + ReduceInitData, + ReduceGetBw, + ReduceRunColl, + ReducePrepare +}; + +void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i invoke ofcclPrepareReduceScatter with count=%lu, collId=%d", pthread_self(), count, collId); From 6c075fd5521492d1c337e24205a551e36e490a36 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 15 Jan 2023 11:16:14 +0000 Subject: [PATCH 106/109] +ofccl_broadcast; fix DEBUG_NT --- nccl_test.sh | 2 +- ofccl_test.sh | 6 ++ src/Makefile | 4 +- src_inplace/Makefile | 4 +- src_manual_size/Makefile | 4 +- src_nccl_manual_size/Makefile | 4 +- src_simple/Makefile | 6 +- src_simple/ofccl_all_gather.cu | 2 +- src_simple/ofccl_broadcast.cu | 146 +++++++++++++++++++++++++++++++++ 9 files changed, 165 insertions(+), 13 deletions(-) create mode 100644 
src_simple/ofccl_broadcast.cu diff --git a/nccl_test.sh b/nccl_test.sh index de799b2..1435e51 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -36,7 +36,7 @@ if [ "$BINARY" == "DEBUG" ];then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export NITER=5 - export NBYTES=1G + export NBYTES=64M export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/ofccl_test.sh b/ofccl_test.sh index f3270ab..1e62664 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -5,6 +5,12 @@ export MY_NUM_DEV=$1 export DEBUG_CC=1 export DEBUG_ENQ=1 +unset DEBUG_CC +unset DEBUG_ENQ + +export DEBUG_NT=1 +unset DEBUG_NT + cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple diff --git a/src/Makefile b/src/Makefile index 8cee9d8..5927cc2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else diff --git a/src_inplace/Makefile b/src_inplace/Makefile index 8b0e124..840c997 100644 --- a/src_inplace/Makefile +++ b/src_inplace/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else diff --git a/src_manual_size/Makefile b/src_manual_size/Makefile index ce42152..363ce69 100644 --- a/src_manual_size/Makefile +++ b/src_manual_size/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else diff --git a/src_nccl_manual_size/Makefile b/src_nccl_manual_size/Makefile index 4a67159..3851d9d 100644 --- a/src_nccl_manual_size/Makefile +++ b/src_nccl_manual_size/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else diff --git a/src_simple/Makefile b/src_simple/Makefile index 1812202..2206f40 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else @@ -89,7 +89,7 @@ $(info CARDNAME 
$(NVCUFLAGS)) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter ofccl_reduce +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter ofccl_reduce ofccl_broadcast BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/ofccl_all_gather.cu b/src_simple/ofccl_all_gather.cu index b22aab9..6cf8ddf 100644 --- a/src_simple/ofccl_all_gather.cu +++ b/src_simple/ofccl_all_gather.cu @@ -88,7 +88,7 @@ testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, int collId, CallBa testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { - NCCLCHECK(ofcclPrepareAllGather(count, datatype, op, comm, collId, rankCtx)); + NCCLCHECK(ofcclPrepareAllGather(count, datatype, comm, collId, rankCtx)); // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllGather with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } diff --git a/src_simple/ofccl_broadcast.cu b/src_simple/ofccl_broadcast.cu new file mode 100644 index 0000000..4a2b217 --- /dev/null +++ b/src_simple/ofccl_broadcast.cu @@ -0,0 +1,146 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6i", size, count, typeName, root); +} + +void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
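+  // For context: this test drives OFCCL through ofcclPrepareBroadcast() once per
+  // collective id and ofcclRunBroadcast() per measured iteration, with this
+  // callback signalling completion. A minimal sketch of that call order, with
+  // rank-context setup/teardown omitted and cbArgs a hypothetical caller-owned
+  // CallBackArgs instance:
+  //   NCCLCHECK(ofcclPrepareBroadcast(count, datatype, root, comm, collId, rankCtx));
+  //   NCCLCHECK(ofcclRunBroadcast(sendbuff, recvbuff, collId, myCallback, &cbArgs, rankCtx));
+  //   // ...then poll cbArgs.gotCqe under cbArgs.mutex until it becomes 1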
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunBroadcast(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunBroadcast for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunBroadcast sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t BroadcastPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareBroadcast(count, datatype, root, comm, collId, rankCtx)); + OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareBroadcast with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl broadcastTest = { + "Broadcast", + BroadcastGetCollByteCount, + BroadcastInitData, + BroadcastGetBw, + BroadcastRunColl, + BroadcastPrepare +}; + +void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + BroadcastGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &broadcastTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i Date: Thu, 19 Jan 2023 03:28:17 +0000 Subject: [PATCH 107/109] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E5=89=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 285 +++++++++++++++++++++++--------------- 1 file changed, 177 insertions(+), 108 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index ac5f5f7..566ff3e 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -7,8 +7,8 @@ font.height = 20*16 style.font = font # 设置环境变量 -os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" -# os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" +#os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" +os.environ['LD_LIBRARY_PATH'] = 
"/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" os.environ['NCCL_ALGO'] = "RING" @@ -21,19 +21,19 @@ # 设置超参数 runNcclTest = True # 运行nccl测试,仅输出原始结果 staticNccl = True # 运行统计,输出中间结果 -collectNcclResult = True # 收集nccl测试结果,写入xls +collectNcclResult = False # 收集nccl测试结果,写入xls -runOfcclTest = True# 运行ofccl测试 -staticOfccl = True # 运行统计,输出中间结果 -staticOfcclExtral = True # 对ofccl的额外输出进行统计 -collectOfcclResult = True# 收集ofccl测试结果,写入xls +runOfcclTest = False# 运行ofccl测试 +staticOfccl = False # 运行统计,输出中间结果 +staticOfcclExtral = False # 对ofccl的额外输出进行统计 +collectOfcclResult = False# 收集ofccl测试结果,写入xls buffer_sizes = ["64", "128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] TINY_TEST = 0 -DATE="230114" +DATE="230118" NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 @@ -67,13 +67,29 @@ table = xlwt.Workbook() -bwSheet = table.add_sheet('bw') -tmSheet = table.add_sheet('time') -cntSheet = table.add_sheet('totalCnt') +AR['bwSheet'] = table.add_sheet('allReduce_bw') +AR['tmShee'] = table.add_sheet('allReduce_time') +AR['cntSheet'] = table.add_sheet('allReduce_totalCnt') + +AG['bwSheet'] = table.add_sheet('allGather_bw') +AG['tmSheet'] = table.add_sheet('allGather_time') +AG['cntSheet'] = table.add_sheet('allGather_totalCnt') + +B['bwSheet'] = table.add_sheet('broadcast_bw') +B['tmSheet'] = table.add_sheet('broadcast_time') +B['cntSheet'] = table.add_sheet('broadcast_totalCnt') + +R['bwSheet'] = table.add_sheet('reduce_bw') +R['tmSheet'] = table.add_sheet('reduce_time') +R['cntSheet'] = table.add_sheet('reduce_totalCnt') + +RS['bwSheet'] = table.add_sheet('reduceScatter_bw') +RS['tmSheet'] = table.add_sheet('reduceScatter_time') +RS['cntSheet'] = table.add_sheet('reduceScatter_totalCnt') # 列宽 -for i in range(30): - bwSheet.col(i).width = 13 * 256 - tmSheet.col(i).width = 16 * 256 +# for i in range(30): +# AR['bwSheet'].col(i).width = 13 * 256 +# AR_tmSheet.col(i).width = 16 * 256 cnt = 0 for MY_NUM_DEV in ncards: @@ -87,69 +103,122 @@ NCCL_RES_DIR ="./nccl/test_result_"+DATE+"_"+NCCL_ORDER+"_"+str(MY_NUM_DEV)+"cards" if not os.path.exists(NCCL_RES_DIR): os.makedirs(NCCL_RES_DIR) - # 统计结果 - NCCL_OUTPUT_BW_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards.txt" - NCCL_OUTPUT_TIME_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards_time.txt" - + # 统计结果 + # allReduce + AR['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_allReduce_"+str(MY_NUM_DEV)+"cards.txt" + AR['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_allReduce_"+str(MY_NUM_DEV)+"cards_time.txt" + # allGather + AG['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_allGather_"+str(MY_NUM_DEV)+"cards.txt" + AG['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_allGather_"+str(MY_NUM_DEV)+"cards_time.txt" + # broadcast + B['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_broadcast_"+str(MY_NUM_DEV)+"cards.txt" + B['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_broadcast_"+str(MY_NUM_DEV)+"cards_time.txt" + # reduce + R['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_reduce_"+str(MY_NUM_DEV)+"cards.txt" + R['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_reduce_"+str(MY_NUM_DEV)+"cards_time.txt" + # reduceScatter + RS['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_reduceScatter_"+str(MY_NUM_DEV)+"cards.txt" + RS['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_time.txt" if staticNccl == True: + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>>"+op['nccl_bw_path']) + os.system("echo $(date 
+%F%n%T)>>"+op['nccl_time_path']) + + os.system("echo $(date +%F%n%T)>>"+NCCL_AG_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_AG_TIME_PATH) + + os.system("echo $(date +%F%n%T)>>"+NCCL_B_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_B_TIME_PATH) + + os.system("echo $(date +%F%n%T)>>"+NCCL_R_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_R_TIME_PATH) + + os.system("echo $(date +%F%n%T)>>"+NCCL_RS_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_RS_TIME_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_TIME_PATH) for iter in [1,2,3]: - NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + # raw data + NCCL_AR = NCCL_RES_DIR+"/nccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + NCCL_AG = NCCL_RES_DIR+"/nccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + NCCL_B = NCCL_RES_DIR+"/nccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + NCCL_R = NCCL_RES_DIR+"/nccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + NCCL_RS = NCCL_RES_DIR+"/nccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + if runNcclTest: - os.system("echo $(date +%F%n%T)>> "+NCCL_RES_PATH) + os.system("echo $(date +%F%n%T)>> "+NCCL_AR) + os.system("echo $(date +%F%n%T)>> "+NCCL_AG) + os.system("echo $(date +%F%n%T)>> "+NCCL_B) + os.system("echo $(date +%F%n%T)>> "+NCCL_R) + os.system("echo $(date +%F%n%T)>> "+NCCL_RS) + for a in buffer_sizes: - os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RES_PATH) + os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_AR) + os.system("../build/all_gather_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_AG) + os.system("../build/broadcast_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_B) + os.system("../build/reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_R) + os.system("../build/reduce_scatter_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RS) + if staticNccl: - os.system("./nccl/static_nccl.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_nccl.out " +NCCL_AR+" " +NCCL_AR_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_AR+" " +NCCL_AR_TIME_PATH+" "+str(MY_NUM_DEV)) + + os.system("./nccl/static_nccl.out " +NCCL_AG+" " +NCCL_AG_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_AG+" " +NCCL_AG_TIME_PATH+" "+str(MY_NUM_DEV)) + + os.system("./nccl/static_nccl.out " +NCCL_B+" " +NCCL_B_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_B+" " +NCCL_B_TIME_PATH+" "+str(MY_NUM_DEV)) + + os.system("./nccl/static_nccl.out " +NCCL_R+" " +NCCL_R_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_R+" " +NCCL_R_TIME_PATH+" "+str(MY_NUM_DEV)) + + os.system("./nccl/static_nccl.out " +NCCL_RS+" " +NCCL_RS_BW_PATH+" "+str(MY_NUM_DEV)) + 
os.system("./nccl/static_time.out " +NCCL_RS+" " +NCCL_RS_TIME_PATH+" "+str(MY_NUM_DEV)) if collectNcclResult == True : # bus - bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + AR_bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - with open(NCCL_OUTPUT_BW_PATH) as f: + with open(NCCL_AR_BW_PATH) as f: content = f.read() bw = content.split() axis_y = buffer_sizes for a in range(0,25): - bwSheet.write(2+a+cnt*30,0,axis_y[a],style) + AR_bwSheet.write(2+a+cnt*30,0,axis_y[a],style) # for k in [0,1,2]: - bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) + AR_bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2],style) + AR_bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2],style) - bwSheet.write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) + AR_bwSheet.write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) + AR_bwSheet.write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) # avg - bwSheet.write(1+cnt*30, 4, 'avg-algbw',style) - bwSheet.write(1+cnt*30, 15, 'avg-busbw',style) + AR_bwSheet.write(1+cnt*30, 4, 'avg-algbw',style) + AR_bwSheet.write(1+cnt*30, 15, 'avg-busbw',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) - bwSheet.write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) + AR_bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) + AR_bwSheet.write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) # time - with open(NCCL_OUTPUT_TIME_PATH) as f2: + with open(NCCL_AR_TIME_PATH) as f2: content2 = f2.read() times = content2.split() - tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + AR_tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) for a in range(0,25): - tmSheet.write(2+a+cnt*30,0,axis_y[a],style) + AR_tmSheet.write(2+a+cnt*30,0,axis_y[a],style) for k in [0,1,2]: - tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k),style) + AR_tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k),style) for i in range(0,25): - tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2],style) + AR_tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2],style) # avg - tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) + AR_tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) + AR_tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) #OFCCL @@ -193,19 +262,19 @@ bw = content2.split() #bus for k in [0,1,2]: - bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) + AR_bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2],style) + AR_bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2],style) - bwSheet.write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) + AR_bwSheet.write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) + AR_bwSheet.write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) # avg - bwSheet.write(1+cnt*30,8, 'avg-algbw',style) - bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) + AR_bwSheet.write(1+cnt*30,8, 'avg-algbw',style) + AR_bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) for i in 
range(0,25): - bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) - bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) + AR_bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) + AR_bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) # time with open(OFCCL_OUTPUT_TIME_PATH) as f2: @@ -213,44 +282,44 @@ times = content2.split() for k in [0,1,2]: - tmSheet.write(1+cnt*30,5+k,'ofccl-'+str(k),style) + AR_tmSheet.write(1+cnt*30,5+k,'ofccl-'+str(k),style) for i in range(0,25): - tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2],style) + AR_tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2],style) # avg - tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) + AR_tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) + AR_tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) if collectNcclResult and collectOfcclResult: - bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) - bwSheet.write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) - tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) - tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) + AR_bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) + AR_bwSheet.write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) + AR_tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) + AR_tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) - bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) - tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) - tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) + AR_bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) + AR_bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) + AR_tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) + AR_tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) # time 各个列的标题 if staticOfcclExtral: - tmSheet.write(1+cnt*30, 13,'nccl IO',style ) - tmSheet.write(1+cnt*30, 14,'nccl kern',style ) - tmSheet.write(1+cnt*30, 15,'ofccl-nccl kern',style ) - tmSheet.write(1+cnt*30, 16,'before after get sqe',style ) - tmSheet.write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) - tmSheet.write(1+cnt*30, 18,'before after put cqe',style ) - tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) - tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) - tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) - tmSheet.write(1+cnt*30, 27,'before after get sqe ori',style ) - tmSheet.write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) - tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) - tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe 
ori',style ) + AR_tmSheet.write(1+cnt*30, 13,'nccl IO',style ) + AR_tmSheet.write(1+cnt*30, 14,'nccl kern',style ) + AR_tmSheet.write(1+cnt*30, 15,'ofccl-nccl kern',style ) + AR_tmSheet.write(1+cnt*30, 16,'before after get sqe',style ) + AR_tmSheet.write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) + AR_tmSheet.write(1+cnt*30, 18,'before after put cqe',style ) + AR_tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) + AR_tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) + AR_tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) + AR_tmSheet.write(1+cnt*30, 27,'before after get sqe ori',style ) + AR_tmSheet.write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) + AR_tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) + AR_tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) y = 64 for i in range(0,25): - tmSheet.write(2+i+cnt*30,12,y,style) + AR_tmSheet.write(2+i+cnt*30,12,y,style) y = y*2 with open(OFCCL_OUTPUT_QE_PATH) as f3: @@ -260,64 +329,64 @@ content4 = f4.read() times4 = content4.split() for i in range(0,25): - tmSheet.write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) - tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) - tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) - tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) - tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) - tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) - tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) - tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) + AR_tmSheet.write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + AR_tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) + AR_tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + AR_tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) + AR_tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) + AR_tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) + AR_tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) + AR_tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) for j in range(0,5): - tmSheet.write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) - tmSheet.write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) - tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) - tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) + AR_tmSheet.write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) + AR_tmSheet.write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) + AR_tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) + AR_tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) - # cntsheet - cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + # AR_cntSheet + AR_cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) axis_y = buffer_sizes for a in range(0,25): - cntSheet.write(2+a+cnt*30,0,axis_y[a],style) + AR_cntSheet.write(2+a+cnt*30,0,axis_y[a],style) - cntSheet.write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) - cntSheet.write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) - cntSheet.write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) - cntSheet.write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) - cntSheet.write(1+cnt*30,6,"totalCtxSaveCnt",style) - cntSheet.write(1+cnt*30,24,"totalCtxLoadCnt",style) - cntSheet.write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) - 
cntSheet.write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) + AR_cntSheet.write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) + AR_cntSheet.write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) + AR_cntSheet.write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) + AR_cntSheet.write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) + AR_cntSheet.write(1+cnt*30,6,"totalCtxSaveCnt",style) + AR_cntSheet.write(1+cnt*30,24,"totalCtxLoadCnt",style) + AR_cntSheet.write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) + AR_cntSheet.write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) with open(OFCCL_OUTPUT_TOTALCNT_PATH) as f: line = f.readline() # save for i in range(0,25): numbers = line.split() - cntSheet.write(i+2+cnt*30,1,numbers[0]) + AR_cntSheet.write(i+2+cnt*30,1,numbers[0]) for j in range(1,len(numbers)): - cntSheet.write(i+2+cnt*30,5+j,numbers[j]) + AR_cntSheet.write(i+2+cnt*30,5+j,numbers[j]) line = f.readline() # load for i in range(0,25): numbers = line.split() - cntSheet.write(i+2+cnt*30,2,numbers[0]) + AR_cntSheet.write(i+2+cnt*30,2,numbers[0]) for j in range(1,len(numbers)): - cntSheet.write(i+2+cnt*30,23+j,numbers[j]) + AR_cntSheet.write(i+2+cnt*30,23+j,numbers[j]) line = f.readline() # totalProgressed7SwithchCnt for i in range(0,25): numbers = line.split() - cntSheet.write(i+2+cnt*30,3,numbers[0]) + AR_cntSheet.write(i+2+cnt*30,3,numbers[0]) for j in range(1,len(numbers)): - cntSheet.write(i+2+cnt*30,41+j,numbers[j]) + AR_cntSheet.write(i+2+cnt*30,41+j,numbers[j]) line = f.readline() # totalUnprogressedQuitCnt for i in range(0,25): numbers = line.split() - cntSheet.write(i+2+cnt*30,4,numbers[0]) + AR_cntSheet.write(i+2+cnt*30,4,numbers[0]) for j in range(1,len(numbers)): - cntSheet.write(i+2+cnt*30,59+j,numbers[j]) + AR_cntSheet.write(i+2+cnt*30,59+j,numbers[j]) line = f.readline() From d81c2572d9fbb4b84f6be1dd18f581fbb5bed273 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Fri, 20 Jan 2023 09:44:42 +0000 Subject: [PATCH 108/109] =?UTF-8?q?=E6=B5=8B=E8=AF=95=20=E4=BA=94=E7=A7=8D?= =?UTF-8?q?=E6=93=8D=E4=BD=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 527 ++++++++++--------- test_scripts/nccl/static_nccl.cpp | 51 +- test_scripts/nccl/static_time.cpp | 44 +- test_scripts/ofccl/static_ofccl_bw_order.cpp | 46 ++ test_scripts/ofccl/static_ofccl_tm_order.cpp | 35 ++ 5 files changed, 416 insertions(+), 287 deletions(-) create mode 100644 test_scripts/ofccl/static_ofccl_bw_order.cpp create mode 100644 test_scripts/ofccl/static_ofccl_tm_order.cpp diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 566ff3e..26536d5 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -19,15 +19,15 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -runNcclTest = True # 运行nccl测试,仅输出原始结果 -staticNccl = True # 运行统计,输出中间结果 -collectNcclResult = False # 收集nccl测试结果,写入xls +runNcclTest = False # 运行nccl测试,仅输出原始结果 +staticNccl = False # 运行统计,输出中间结果 +collectNcclResult = True# 收集nccl测试结果,写入xls -runOfcclTest = False# 运行ofccl测试 -staticOfccl = False # 运行统计,输出中间结果 -staticOfcclExtral = False # 对ofccl的额外输出进行统计 -collectOfcclResult = False# 收集ofccl测试结果,写入xls +runOfcclTest = True# 运行ofccl测试 +staticOfccl = True # 运行统计,输出中间结果 +staticOfcclExtral = True# 对ofccl的额外输出进行统计 +collectOfcclResult = True# 收集ofccl测试结果,写入xls buffer_sizes = ["64", "128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] @@ -53,7 +53,8 @@ 
collectNcclResult = False # 收集nccl测试结果,写入xls ncards = [2] # buffer_sizes = ["64", "128", "256", "512", "1K"] - +NCCL_TIER=[1,2,3] +OFCCL_ITER=[1,2,3,4,5,6] resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+str(m)+"n"+str(n)+"w"+str(w)+".xls" # static @@ -64,28 +65,44 @@ os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") os.system("g++ ./ofccl/static_ofccl_QE_ori.cpp -o ./ofccl/static_ofccl_QE_ori.out") os.system("g++ ./ofccl/static_ofccl_totalCnt.cpp -o ./ofccl/static_ofccl_totalCnt.out") - +os.system("g++ ./ofccl/static_ofccl_bw_order.cpp -o ./ofccl/static_ofccl_bw_order.out ") +os.system("g++ ./ofccl/static_ofccl_tm_order.cpp -o ./ofccl/static_ofccl_tm_order.out ") +AR = {} +AG = {} +B = {} +R = {} +RS = {} table = xlwt.Workbook() AR['bwSheet'] = table.add_sheet('allReduce_bw') -AR['tmShee'] = table.add_sheet('allReduce_time') +AR['tmSheet'] = table.add_sheet('allReduce_time') AR['cntSheet'] = table.add_sheet('allReduce_totalCnt') +AR['run'] = "../build/all_reduce_perf" +AR['runOfccl'] = "../build/ofccl_all_reduce_perf" AG['bwSheet'] = table.add_sheet('allGather_bw') AG['tmSheet'] = table.add_sheet('allGather_time') AG['cntSheet'] = table.add_sheet('allGather_totalCnt') +AG['run'] = "../build/all_gather_perf" +AG['runOfccl'] = "../build/ofccl_all_gather_perf" B['bwSheet'] = table.add_sheet('broadcast_bw') B['tmSheet'] = table.add_sheet('broadcast_time') B['cntSheet'] = table.add_sheet('broadcast_totalCnt') +B['run'] = "../build/broadcast_perf" +B['runOfccl']="../build/ofccl_broadcast_perf" R['bwSheet'] = table.add_sheet('reduce_bw') R['tmSheet'] = table.add_sheet('reduce_time') R['cntSheet'] = table.add_sheet('reduce_totalCnt') +R['run'] = "../build/reduce_perf" +R['runOfccl']= "../build/ofccl_reduce_perf" RS['bwSheet'] = table.add_sheet('reduceScatter_bw') RS['tmSheet'] = table.add_sheet('reduceScatter_time') RS['cntSheet'] = table.add_sheet('reduceScatter_totalCnt') +RS['run'] = "../build/reduce_scatter_perf" +RS['runOfccl'] = "../build/ofccl_reduce_scatter_perf" # 列宽 # for i in range(30): # AR['bwSheet'].col(i).width = 13 * 256 @@ -125,100 +142,73 @@ os.system("echo $(date +%F%n%T)>>"+op['nccl_bw_path']) os.system("echo $(date +%F%n%T)>>"+op['nccl_time_path']) - os.system("echo $(date +%F%n%T)>>"+NCCL_AG_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_AG_TIME_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_B_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_B_TIME_PATH) - - os.system("echo $(date +%F%n%T)>>"+NCCL_R_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_R_TIME_PATH) + for iter in NCCL_TIER: + # raw data + AR['nccl_rawData'] = NCCL_RES_DIR+"/nccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + AG['nccl_rawData'] = NCCL_RES_DIR+"/nccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + B['nccl_rawData'] = NCCL_RES_DIR+"/nccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + R['nccl_rawData'] = NCCL_RES_DIR+"/nccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + RS['nccl_rawData'] = NCCL_RES_DIR+"/nccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - os.system("echo $(date +%F%n%T)>>"+NCCL_RS_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_RS_TIME_PATH) + if runNcclTest: + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>> "+op['nccl_rawData']) + for a in buffer_sizes: + os.system(op['run']+" -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ op['nccl_rawData']) - for iter in [1,2,3]: - 
# raw data - NCCL_AR = NCCL_RES_DIR+"/nccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - NCCL_AG = NCCL_RES_DIR+"/nccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - NCCL_B = NCCL_RES_DIR+"/nccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - NCCL_R = NCCL_RES_DIR+"/nccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - NCCL_RS = NCCL_RES_DIR+"/nccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + if staticNccl: + for op in [AR,AG,B,R,RS]: + os.system("./nccl/static_nccl.out " +op['nccl_rawData'] +" " +op['nccl_bw_path']) + os.system("./nccl/static_time.out " +op['nccl_rawData'] +" " +op['nccl_time_path']) - if runNcclTest: - os.system("echo $(date +%F%n%T)>> "+NCCL_AR) - os.system("echo $(date +%F%n%T)>> "+NCCL_AG) - os.system("echo $(date +%F%n%T)>> "+NCCL_B) - os.system("echo $(date +%F%n%T)>> "+NCCL_R) - os.system("echo $(date +%F%n%T)>> "+NCCL_RS) - - for a in buffer_sizes: - os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_AR) - os.system("../build/all_gather_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_AG) - os.system("../build/broadcast_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_B) - os.system("../build/reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_R) - os.system("../build/reduce_scatter_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RS) - - if staticNccl: - os.system("./nccl/static_nccl.out " +NCCL_AR+" " +NCCL_AR_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_AR+" " +NCCL_AR_TIME_PATH+" "+str(MY_NUM_DEV)) - - os.system("./nccl/static_nccl.out " +NCCL_AG+" " +NCCL_AG_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_AG+" " +NCCL_AG_TIME_PATH+" "+str(MY_NUM_DEV)) - - os.system("./nccl/static_nccl.out " +NCCL_B+" " +NCCL_B_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_B+" " +NCCL_B_TIME_PATH+" "+str(MY_NUM_DEV)) - - os.system("./nccl/static_nccl.out " +NCCL_R+" " +NCCL_R_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_R+" " +NCCL_R_TIME_PATH+" "+str(MY_NUM_DEV)) - - os.system("./nccl/static_nccl.out " +NCCL_RS+" " +NCCL_RS_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_RS+" " +NCCL_RS_TIME_PATH+" "+str(MY_NUM_DEV)) + - if collectNcclResult == True : - # bus - AR_bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - - with open(NCCL_AR_BW_PATH) as f: - content = f.read() - bw = content.split() - - axis_y = buffer_sizes - for a in range(0,25): - AR_bwSheet.write(2+a+cnt*30,0,axis_y[a],style) - # - for k in [0,1,2]: - AR_bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) - for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2],style) - - AR_bwSheet.write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) + if collectNcclResult : + for op in [AR,AG,B,R,RS]: + # bus + op['bwSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + + with open(op['nccl_bw_path']) as f: + content = f.read() + bw = content.split() + + axis_y = buffer_sizes + for a in range(0,25): + op['bwSheet'].write(2+a+cnt*30,0,axis_y[a],style) + # + for k in [0,1,2]: + 
op['bwSheet'].write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,1+k,bw[i+k*50+2],style) + + op['bwSheet'].write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) + # avg + op['bwSheet'].write(1+cnt*30, 4, 'avg-algbw',style) + op['bwSheet'].write(1+cnt*30, 15, 'avg-busbw',style) for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) - # avg - AR_bwSheet.write(1+cnt*30, 4, 'avg-algbw',style) - AR_bwSheet.write(1+cnt*30, 15, 'avg-busbw',style) - for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) - AR_bwSheet.write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) - - # time - with open(NCCL_AR_TIME_PATH) as f2: - content2 = f2.read() - times = content2.split() - - AR_tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - for a in range(0,25): - AR_tmSheet.write(2+a+cnt*30,0,axis_y[a],style) - for k in [0,1,2]: - AR_tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k),style) + op['bwSheet'].write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) + op['bwSheet'].write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) + + # time + with open(op['nccl_time_path']) as f2: + content2 = f2.read() + times = content2.split() + + op['tmSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + for a in range(0,25): + op['tmSheet'].write(2+a+cnt*30,0,axis_y[a],style) + for k in [0,1,2]: + op['tmSheet'].write(1+cnt*30,1+k,'nccl-'+str(k),style) + for i in range(0,25): + op['tmSheet'].write(2+i+cnt*30,1+k,times[i+k*25+2],style) + # avg + op['tmSheet'].write(1+cnt*30, 4, 'avg-nccl',style) for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2],style) - # avg - AR_tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) - for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) + op['tmSheet'].write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) #OFCCL @@ -227,167 +217,222 @@ if not os.path.exists(OFCCL_RES_DIR): os.makedirs(OFCCL_RES_DIR) # 统计结果 - OFCCL_OUTPUT_BW_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards.txt" - OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" - OFCCL_OUTPUT_QE_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE.txt" - OFCCL_OUTPUT_QE_ORI_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" - OFCCL_OUTPUT_TOTALCNT_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" - - if staticOfccl == True: - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_TIME_PATH) + AR['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards.txt" + AR['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_order_"+str(MY_NUM_DEV)+"cards.txt" + AR['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_time.txt" + AR['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_order_"+str(MY_NUM_DEV)+"cards_time.txt" + 
AR['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_QE.txt" + AR['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + AR['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + AG['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards.txt" + AG['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_order_"+str(MY_NUM_DEV)+"cards.txt" + AG['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_time.txt" + AG['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_order_"+str(MY_NUM_DEV)+"cards_time.txt" + AG['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_QE.txt" + AG['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + AG['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + B['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards.txt" + B['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_order_"+str(MY_NUM_DEV)+"cards.txt" + B['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_time.txt" + B['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_order_"+str(MY_NUM_DEV)+"cards_time.txt" + B['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_QE.txt" + B['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + B['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + R['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards.txt" + R['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_order_"+str(MY_NUM_DEV)+"cards.txt" + R['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_time.txt" + R['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_order_"+str(MY_NUM_DEV)+"cards_time.txt" + R['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_QE.txt" + R['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + R['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + RS['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards.txt" + RS['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_order_"+str(MY_NUM_DEV)+"cards.txt" + RS['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_time.txt" + RS['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_order_"+str(MY_NUM_DEV)+"cards_time.txt" + RS['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_QE.txt" + RS['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + RS['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + if staticOfccl: + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>>"+op['ofccl_bw_path']) + os.system("echo $(date +%F%n%T)>>"+op['ofccl_tm_path']) + os.system("echo $(date +%F%n%T)>>"+op['ofccl_bw_order_path']) + os.system("echo $(date +%F%n%T)>>"+op['ofccl_tm_order_path']) if staticOfcclExtral: - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_PATH) - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_ORI_PATH) + for op in 
[AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>>"+op['ofccl_qe_path']) + os.system("echo $(date +%F%n%T)>>"+op['ofccl_qeOri_path']) + - for iter in [1,2,3]: - OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" + for iter in OFCCL_ITER: + # raw data + AR['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + AG['ofccl_rawData'] =OFCCL_RES_DIR+"/ofccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + B['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + R['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + RS['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + if runOfcclTest: - os.system("echo $(date +%F%n%T)>> "+OFCCL_RES_PATH) - for a in buffer_sizes: - os.system("../build/ofccl_all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ OFCCL_RES_PATH) + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>> "+op['ofccl_rawData']) + for a in buffer_sizes: + os.system(op['runOfccl']+" -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ op['ofccl_rawData']) if staticOfccl: - os.system("./ofccl/static_ofccl_bw.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH) - os.system("./ofccl/static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH) + for op in [AR,AG,B,R,RS]: + os.system("./ofccl/static_ofccl_bw.out " +op['ofccl_rawData']+" " +op['ofccl_bw_path']) + os.system("./ofccl/static_ofccl_time.out " +op['ofccl_rawData']+" " + op['ofccl_tm_path']) if staticOfcclExtral: - os.system("./ofccl/static_ofccl_QE.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_PATH) - os.system("./ofccl/static_ofccl_QE_ori.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_ORI_PATH) - os.system("./ofccl/static_ofccl_totalCnt.out "+OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TOTALCNT_PATH) + for op in [AR,AG,B,R,RS]: + os.system("./ofccl/static_ofccl_QE.out " +op['ofccl_rawData']+" " + op['ofccl_qe_path']) + os.system("./ofccl/static_ofccl_QE_ori.out " +op['ofccl_rawData']+" " + op['ofccl_qeOri_path']) + os.system("./ofccl/static_ofccl_totalCnt.out "+op['ofccl_rawData']+" " + op['ofccl_totalCnt_path']) + if staticOfccl: + for op in [AR,AG,B,R,RS]: + os.system("./ofccl/static_ofccl_bw_order.out "+op['ofccl_bw_path']+" "+op['ofccl_bw_order_path']+" "+ str(len(OFCCL_ITER))) + os.system("./ofccl/static_ofccl_tm_order.out "+op['ofccl_tm_path']+" "+op['ofccl_tm_order_path']+" "+ str(len(OFCCL_ITER))) if collectOfcclResult == True: - - with open(OFCCL_OUTPUT_BW_PATH) as f2: - content2 = f2.read() - bw = content2.split() - #bus - for k in [0,1,2]: - AR_bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) + #bus width + for op in [AR,AG,B,R,RS]: + with open(op['ofccl_bw_order_path']) as f2: + content2 = f2.read() + bw = content2.split() + + for k in [0,1,2]: + op['bwSheet'].write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,5+k,bw[i+k*50+2],style) + + op['bwSheet'].write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) + # avg + op['bwSheet'].write(1+cnt*30,8, 'avg-algbw',style) + op['bwSheet'].write(1+cnt*30, 19, 'avg-busbw',style) for i in range(0,25): - 
AR_bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2],style) - - AR_bwSheet.write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) + op['bwSheet'].write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) + op['bwSheet'].write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) + + # time + with open(op['ofccl_tm_order_path']) as f2: + content2 = f2.read() + times = content2.split() + + for k in [0,1,2]: + op['tmSheet'].write(1+cnt*30,5+k,'ofccl-'+str(k),style) + for i in range(0,25): + op['tmSheet'].write(2+i+cnt*30,5+k,times[i+k*25+2],style) + # avg + op['tmSheet'].write(1+cnt*30, 4+4, 'avg-ofccl',style) for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) - # avg - AR_bwSheet.write(1+cnt*30,8, 'avg-algbw',style) - AR_bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) - for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) - AR_bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) - - # time - with open(OFCCL_OUTPUT_TIME_PATH) as f2: - content2 = f2.read() - times = content2.split() - - for k in [0,1,2]: - AR_tmSheet.write(1+cnt*30,5+k,'ofccl-'+str(k),style) - for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2],style) - # avg - AR_tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) - for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) + op['tmSheet'].write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) if collectNcclResult and collectOfcclResult: - AR_bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) - AR_bwSheet.write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) - AR_tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) - AR_tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) - for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) - AR_bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) - AR_tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) - AR_tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) + for op in [AR,AG,B,R,RS]: + op['bwSheet'].write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) + op['bwSheet'].write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) + op['tmSheet'].write(1+cnt*30, 9, 'ofccl-nccl',style) + op['tmSheet'].write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) + op['bwSheet'].write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) + op['tmSheet'].write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) + op['tmSheet'].write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) # time 各个列的标题 if staticOfcclExtral: - AR_tmSheet.write(1+cnt*30, 13,'nccl IO',style ) - AR_tmSheet.write(1+cnt*30, 14,'nccl kern',style ) - 
AR_tmSheet.write(1+cnt*30, 15,'ofccl-nccl kern',style ) - AR_tmSheet.write(1+cnt*30, 16,'before after get sqe',style ) - AR_tmSheet.write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) - AR_tmSheet.write(1+cnt*30, 18,'before after put cqe',style ) - AR_tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) - AR_tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) - AR_tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) - AR_tmSheet.write(1+cnt*30, 27,'before after get sqe ori',style ) - AR_tmSheet.write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) - AR_tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) - AR_tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) - - y = 64 - for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30,12,y,style) - y = y*2 - - with open(OFCCL_OUTPUT_QE_PATH) as f3: - content3 = f3.read() - times = content3.split() - with open(OFCCL_OUTPUT_QE_ORI_PATH) as f4: - content4 = f4.read() - times4 = content4.split() - for i in range(0,25): - AR_tmSheet.write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) - AR_tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) - AR_tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) - AR_tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) - AR_tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) - AR_tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) - AR_tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) - AR_tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) - for j in range(0,5): - AR_tmSheet.write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) - AR_tmSheet.write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) - AR_tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) - AR_tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) - - # AR_cntSheet - AR_cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - axis_y = buffer_sizes - for a in range(0,25): - AR_cntSheet.write(2+a+cnt*30,0,axis_y[a],style) - - AR_cntSheet.write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) - AR_cntSheet.write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) - AR_cntSheet.write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) - AR_cntSheet.write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) - AR_cntSheet.write(1+cnt*30,6,"totalCtxSaveCnt",style) - AR_cntSheet.write(1+cnt*30,24,"totalCtxLoadCnt",style) - AR_cntSheet.write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) - AR_cntSheet.write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) - - with open(OFCCL_OUTPUT_TOTALCNT_PATH) as f: - line = f.readline() - # save - for i in range(0,25): - numbers = line.split() - AR_cntSheet.write(i+2+cnt*30,1,numbers[0]) - for j in range(1,len(numbers)): - AR_cntSheet.write(i+2+cnt*30,5+j,numbers[j]) - line = f.readline() - # load - for i in range(0,25): - numbers = line.split() - AR_cntSheet.write(i+2+cnt*30,2,numbers[0]) - for j in range(1,len(numbers)): - AR_cntSheet.write(i+2+cnt*30,23+j,numbers[j]) - line = f.readline() - # totalProgressed7SwithchCnt - for i in range(0,25): - numbers = line.split() - AR_cntSheet.write(i+2+cnt*30,3,numbers[0]) - for j in range(1,len(numbers)): - AR_cntSheet.write(i+2+cnt*30,41+j,numbers[j]) - line = f.readline() - # totalUnprogressedQuitCnt - for i in range(0,25): - numbers = line.split() - AR_cntSheet.write(i+2+cnt*30,4,numbers[0]) - for j in range(1,len(numbers)): - AR_cntSheet.write(i+2+cnt*30,59+j,numbers[j]) + for op in [AR,AG,B,R,RS]: + 
op['tmSheet'].write(1+cnt*30, 13,'nccl IO',style ) + op['tmSheet'].write(1+cnt*30, 14,'nccl kern',style ) + op['tmSheet'].write(1+cnt*30, 15,'ofccl-nccl kern',style ) + op['tmSheet'].write(1+cnt*30, 16,'before after get sqe',style ) + op['tmSheet'].write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) + op['tmSheet'].write(1+cnt*30, 18,'before after put cqe',style ) + op['tmSheet'].write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) + op['tmSheet'].write(1+cnt*30, 20,'occl rank0 time',style ) + op['tmSheet'].write(1+cnt*30, 21,'nccl kern ori',style ) + op['tmSheet'].write(1+cnt*30, 27,'before after get sqe ori',style ) + op['tmSheet'].write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) + op['tmSheet'].write(1+cnt*30, 39,'before after put cqe ori',style ) + op['tmSheet'].write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) + + y = 64 + for i in range(0,25): + op['tmSheet'].write(2+i+cnt*30,12,y,style) + y = y*2 + + with open(op['ofccl_qe_path']) as f3: + content3 = f3.read() + times = content3.split() + with open(op['ofccl_qeOri_path']) as f4: + content4 = f4.read() + times4 = content4.split() + for i in range(0,25): + op['tmSheet'].write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + op['tmSheet'].write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) + op['tmSheet'].write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + op['tmSheet'].write(2+cnt*30+i,16,times[2+125*cnt+i],style) + op['tmSheet'].write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) + op['tmSheet'].write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) + op['tmSheet'].write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) + op['tmSheet'].write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) + for j in range(0,5): + op['tmSheet'].write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) + op['tmSheet'].write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) + op['tmSheet'].write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) + op['tmSheet'].write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) + + # cntSheet + op['cntSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + axis_y = buffer_sizes + for a in range(0,25): + op['cntSheet'].write(2+a+cnt*30,0,axis_y[a],style) + + op['cntSheet'].write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) + op['cntSheet'].write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) + op['cntSheet'].write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) + op['cntSheet'].write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) + op['cntSheet'].write(1+cnt*30,6,"totalCtxSaveCnt",style) + op['cntSheet'].write(1+cnt*30,24,"totalCtxLoadCnt",style) + op['cntSheet'].write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) + op['cntSheet'].write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) + + with open(op['ofccl_totalCnt_path']) as f: line = f.readline() + # save + for i in range(0,25): + numbers = line.split() + op['cntSheet'].write(i+2+cnt*30,1,numbers[0]) + for j in range(1,len(numbers)): + op['cntSheet'].write(i+2+cnt*30,5+j,numbers[j]) + line = f.readline() + # load + for i in range(0,25): + numbers = line.split() + op['cntSheet'].write(i+2+cnt*30,2,numbers[0]) + for j in range(1,len(numbers)): + op['cntSheet'].write(i+2+cnt*30,23+j,numbers[j]) + line = f.readline() + # totalProgressed7SwithchCnt + for i in range(0,25): + numbers = line.split() + op['cntSheet'].write(i+2+cnt*30,3,numbers[0]) + for j in range(1,len(numbers)): + op['cntSheet'].write(i+2+cnt*30,41+j,numbers[j]) + line = f.readline() + # totalUnprogressedQuitCnt + 
for i in range(0,25): + numbers = line.split() + op['cntSheet'].write(i+2+cnt*30,4,numbers[0]) + for j in range(1,len(numbers)): + op['cntSheet'].write(i+2+cnt*30,59+j,numbers[j]) + line = f.readline() diff --git a/test_scripts/nccl/static_nccl.cpp b/test_scripts/nccl/static_nccl.cpp index f12519a..911fd0c 100644 --- a/test_scripts/nccl/static_nccl.cpp +++ b/test_scripts/nccl/static_nccl.cpp @@ -6,36 +6,37 @@ int main(int argc,char* argv[]){ freopen(argv[1],"r",stdin); freopen(argv[2],"a",stdout); - int ranks = *(argv[3]) - '0'; - string str; - stringstream ss; + string inputLine; vector a; vector b; - string line; - // time - getline(cin,line); + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; - for(int t =0;t < 25;t++){ - for(int i = 0;i < (7+ranks);i++) - getline(cin,line); - - for(int i =0;i < 6;i++) - cin >> str; - - a.push_back(str); - cin >> str; - b.push_back(str); - - - for(int i = 0;i < 4;i++) - getline(cin,line); + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + b.push_back(ss.top()); + ss.pop(); + a.push_back(ss.top()); + if(++cnt == 25) + break; } - for(int i=0;i a; vector b; - string line; - // time - getline(cin,line); + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; - for(int t =0;t < 25;t++){ - for(int i = 0;i < (7+ranks);i++) - getline(cin,line); - - for(int i =0;i < 5;i++) - cin >> str; - - a.push_back(str); - - for(int i = 0;i < 4;i++) - getline(cin,line); + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + ss.pop(); + ss.pop(); + a.push_back(ss.top()); + if(++cnt == 25) + break; } - for(int i=0;i,less>> a(25,priority_queue,less>()); + vector,less>> b(25,priority_queue,less>()); + + + for(int i = 0;i < num;i++){ + for(int j = 0;j < 25;j++){ + double tmp; + cin>>tmp; + a[j].push(tmp); + } + for(int j = 0;j < 25;j++){ + double tmp; + cin>>tmp; + b[j].push(tmp); + } + } + + for(int i = 0;i < num;i++){ + for(int j = 0;j < 25;j++){ + double tmp; + tmp = a[j].top();a[j].pop(); + cout<,greater>> a(25,priority_queue,greater>()); + + for(int i = 0;i < num;i++){ + for(int j = 0;j < 25;j++){ + double tmp; + cin>>tmp; + a[j].push(tmp); + } + + } + + for(int i = 0;i < num;i++){ + for(int j = 0;j < 25;j++){ + double tmp; + tmp = a[j].top();a[j].pop(); + cout< Date: Fri, 27 Jan 2023 03:16:15 +0000 Subject: [PATCH 109/109] =?UTF-8?q?=E5=8E=BB=E9=99=A4=20xlrd?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 26536d5..1f8c9f1 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -1,5 +1,5 @@ import os -import xlrd + import xlwt # 设置字体大小 style = xlwt.XFStyle() @@ -19,8 +19,8 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = False # 运行统计,输出中间结果 +runNcclTest = True # 运行nccl测试,仅输出原始结果 +staticNccl = True # 运行统计,输出中间结果 collectNcclResult = True# 收集nccl测试结果,写入xls
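
The hunks above all index five per-collective dicts, AR, AG, B, R and RS, through keys such as op['runOfccl'], op['ofccl_bw_path'], op['bwSheet'], op['tmSheet'] and op['cntSheet']; their construction happens earlier in auto_test.py and is not shown in these patches. A minimal sketch of the assumed layout follows, where only ../build/ofccl_all_reduce_perf is taken from the removed line in the test-run hunk, and the other binary paths, the 'name' key and the sheet titles are illustrative assumptions rather than part of the patch:

import xlwt

book = xlwt.Workbook()

# One dict per collective; the run, statistics and collection loops only read these keys.
AR = {'name': 'allReduce',     'runOfccl': '../build/ofccl_all_reduce_perf'}       # path seen in the removed line
AG = {'name': 'allGather',     'runOfccl': '../build/ofccl_all_gather_perf'}       # assumed binary name
B  = {'name': 'broadcast',     'runOfccl': '../build/ofccl_broadcast_perf'}        # assumed binary name
R  = {'name': 'reduce',        'runOfccl': '../build/ofccl_reduce_perf'}           # assumed binary name
RS = {'name': 'reduceScatter', 'runOfccl': '../build/ofccl_reduce_scatter_perf'}   # assumed binary name

for op in [AR, AG, B, R, RS]:
    # xlwt worksheets that the bandwidth, time and counter cells are written into
    op['bwSheet']  = book.add_sheet(op['name'] + '_bw')
    op['tmSheet']  = book.add_sheet(op['name'] + '_time')
    op['cntSheet'] = book.add_sheet(op['name'] + '_cnt')
    # The result-file keys ('ofccl_bw_path', 'ofccl_tm_order_path', 'ofccl_rawData', ...)
    # are then filled in per device count and per iteration, as in the hunks above.

With a layout like this, the refactor in these patches reduces the former AR_*-specific code to a single "for op in [AR,AG,B,R,RS]" loop covering test execution, the static_ofccl_* post-processing binaries and the spreadsheet collection.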