diff --git a/.gitignore b/.gitignore
index a0a013e..99f99d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,11 @@
 #
 # See LICENCE.txt for license information
 /build
+
+.clangd
+
+.vscode
+
+*result*/
+*.xls
+*.out
\ No newline at end of file
diff --git a/README.md b/README.md
index bff6433..1c3c505 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,7 @@ All tests support the same set of arguments :
   * `-n,--iters <iteration count>` number of iterations. Default : 20.
   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
   * `-m,--agg_iters <aggregated iteration count>` number of operations to aggregate together in each iteration. Default : 1.
+  * `-M,--multi_iters <multi iteration count>` number of operations with separate ncclComms in each iteration. Default : 1.
   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
diff --git a/nccl_test.sh b/nccl_test.sh
new file mode 100644
index 0000000..1435e51
--- /dev/null
+++ b/nccl_test.sh
@@ -0,0 +1,87 @@
+clear
+
+export MY_NUM_DEV=$1
+
+cd /home/panlichen/work2/nccl-tests
+export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib
+export NCCL_PROTO=Simple
+export NCCL_ALGO=Ring
+# export NCCL_MAX_NCHANNELS=1
+# export NCCL_MIN_NCHANNELS=1
+# export NCCL_NTHREADS=64
+
+if [ -z $BINARY ];then
+  BINARY="DEBUG"
+  # BINARY="MS"
+  # BINARY="PERF"
+fi
+
+FUNC=$2
+
+if [ "$FUNC" == "AR" ]; then
+  target="./build/all_reduce_perf"
+elif [ "$FUNC" == "AG" ]; then
+  target="./build/all_gather_perf"
+elif [ "$FUNC" == "RS" ]; then
+  target="./build/reduce_scatter_perf"
+elif [ "$FUNC" == "R" ]; then
+  target="./build/reduce_perf"
+elif [ "$FUNC" == "B" ]; then
+  target="./build/broadcast_perf"
+fi
+
+
+if [ "$BINARY" == "DEBUG" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  export NITER=5
+  export NBYTES=64M
+  export WARMITER=2
+  export MITER=1
+  export CHECK=0
+elif [ "$BINARY" == "PERF" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  export NITER=4
+  export NBYTES=8K
+  export WARMITER=2
+  export MITER=4
+  export CHECK=0
+elif [ "$BINARY" == "MS" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  # export NITER=200
+  # export SHOW_ALL_PREPARED_COLL=1
+  # export WARMITER=0
+  # export NBYTES=8K
+  # export MITER=4
+fi
+
+export NSYS_FILE="nccl"
+export NCU_FILE="nccl"
+
+if [ -z $RUN_TYPE ];then
+  RUN_TYPE="PURE"
+  # RUN_TYPE="GDB"
+  # RUN_TYPE="NSYS"
+  # RUN_TYPE="NCU"
+fi
+
+if [ "$RUN_TYPE" == "PURE" ];then
+  cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER"
+elif [ "$RUN_TYPE" == "GDB" ];then
+  cmd="cuda-gdb $target"
+  # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0
+elif [ "$RUN_TYPE" == "NSYS" ];then
+  cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER"
+elif [ "$RUN_TYPE" == "NCU" ];then
+  # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER"
+  cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER"
+fi
+
+echo cmd=$cmd
+$cmd #> /home/panlichen/work2/ofccl/log/ofccl-2ms-coll-master.log
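The new -M/--multi_iters flag (README hunk above) makes each timed iteration issue M operations, each on its own communicator. A minimal sketch of that launch pattern, assuming one GPU per comm; MULTI_ITERS and launchMultiIters are illustrative names, not part of this patch:

#include <nccl.h>
#include <cuda_runtime.h>

#define MULTI_ITERS 4  // illustrative value for -M

// Issue MULTI_ITERS independent allreduces, one per communicator, mirroring
// the args->comms[miter * nGpus + i] selection in common_inplace.cu below.
void launchMultiIters(ncclComm_t comms[MULTI_ITERS],
                      cudaStream_t streams[MULTI_ITERS],
                      const float *send, float *recv, size_t count) {
  for (int miter = 0; miter < MULTI_ITERS; miter++) {
    ncclAllReduce(send, recv, count, ncclFloat, ncclSum,
                  comms[miter], streams[miter]);
  }
}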
diff --git a/ofccl_test.sh b/ofccl_test.sh
new file mode 100644
index 0000000..1e62664
--- /dev/null
+++ b/ofccl_test.sh
@@ -0,0 +1,166 @@
+clear
+
+export MY_NUM_DEV=$1
+
+export DEBUG_CC=1
+export DEBUG_ENQ=1
+
+unset DEBUG_CC
+unset DEBUG_ENQ
+
+export DEBUG_NT=1
+unset DEBUG_NT
+
+cd /home/panlichen/work2/nccl-tests
+export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib
+export NCCL_PROTO=Simple
+export NCCL_ALGO=Ring
+# export NCCL_MAX_NCHANNELS=1
+# export NCCL_MIN_NCHANNELS=1
+# export NCCL_NTHREADS=64
+
+export CHECK=0
+export SHOW_ALL_PREPARED_COLL=0
+
+export TRAVERSE_TIMES=10
+export TOLERANT_UNPROGRESSED_CNT=10000
+export BASE_CTX_SWITCH_THRESHOLD=80
+export BOUNS_SWITCH_4_PROCESSED_COLL=0
+export DEV_TRY_ROUND=10
+export CHECK_REMAINING_SQE_INTERVAL=10000
+export DEBUG_FILE="/home/panlichen/work2/ofccl/log/oneflow_cpu_rank_"
+
+rm -rf /home/panlichen/work2/ofccl/log
+mkdir -p /home/panlichen/work2/ofccl/log
+
+# export ENABLE_VQ=1 # volunteer quit
+# export TOLERANT_FAIL_CHECK_SQ_CNT=5000
+# export CNT_BEFORE_QUIT=5
+
+echo TRAVERSE_TIMES=$TRAVERSE_TIMES
+echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT
+echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD
+echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL
+echo DEV_TRY_ROUND=$DEV_TRY_ROUND
+echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL
+echo DEBUG_FILE=$DEBUG_FILE
+
+if [ ! -z $ENABLE_VQ ];then
+  echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT
+  echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT
+fi
+
+FUNC=$2
+if [ -z $FUNC ]; then
+  FUNC="AR"
+fi
+
+if [ "$FUNC" == "AR" ]; then
+  target="./build/ofccl_all_reduce_perf"
+elif [ "$FUNC" == "AG" ]; then
+  target="./build/ofccl_all_gather_perf"
+elif [ "$FUNC" == "RS" ]; then
+  target="./build/ofccl_reduce_scatter_perf"
+elif [ "$FUNC" == "R" ]; then
+  target="./build/ofccl_reduce_perf"
+elif [ "$FUNC" == "B" ]; then
+  target="./build/ofccl_broadcast_perf"
+fi
+
+if [ -z $BINARY ];then
+  BINARY="DEBUG"
+  # BINARY="MS"
+  # BINARY="PERF"
+fi
+
+if [ "$BINARY" == "DEBUG" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  if [ $MY_NUM_DEV = 2 ]; then
+    export CUDA_VISIBLE_DEVICES=4,5
+  fi
+  export NITER=5
+  export NBYTES=64M
+  export WARMITER=2
+  export MITER=1
+elif [ "$BINARY" == "PERF" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  export NITER=8
+  export NBYTES=8K
+  export WARMITER=2
+  export MITER=1
+elif [ "$BINARY" == "MS" ];then
+  target="./build/ofccl_all_reduce_ms_perf"
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  export NITER=200
+  export SHOW_ALL_PREPARED_COLL=1
+  export WARMITER=0
+  export NBYTES=8K
+  export MITER=4
+  export CHECK=0
+fi
+
+export NSYS_FILE="ofccl"
+export NCU_FILE="ofccl"
+
+if [ -z $RUN_TYPE ];then
+  RUN_TYPE="PURE"
+  # RUN_TYPE="GDB"
+  # RUN_TYPE="NSYS"
+  # RUN_TYPE="NCU"
+fi
+
+# typedef enum { ncclInt8 = 0, ncclChar = 0,
+#                ncclUint8 = 1,
+#                ncclInt32 = 2, ncclInt = 2,
+#                ncclUint32 = 3,
+#                ncclInt64 = 4,
+#                ncclUint64 = 5,
+#                ncclFloat16 = 6, ncclHalf = 6,
+#                ncclFloat32 = 7, ncclFloat = 7,
+#                ncclFloat64 = 8, ncclDouble = 8,
+# #if defined(__CUDA_BF16_TYPES_EXIST__)
+#                ncclBfloat16 = 9,
+#                ncclNumTypes = 10
+# #else
+#                ncclNumTypes = 9
+# #endif
+# } ncclDataType_t;
+
+# Use this one:
+# const char *test_typenames[ncclNumTypes] = {"int8",
+#                                             "uint8",
+#                                             "int32",
+#                                             "uint32",
+#                                             "int64",
+#                                             "uint64",
+#                                             "half",
+#                                             "float",
+#                                             "double"
+# #if defined(__CUDA_BF16_TYPES_EXIST__) && \
+#     NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+#                                             ,
+#                                             "bfloat16"
+# #endif
+# };
+
+if [ "$RUN_TYPE" == "PURE" ];then
+  cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" # -d half
+elif [ "$RUN_TYPE" == "GDB" ];then
+  cmd="cuda-gdb $target"
+  # set args -b 64 -e 64 -f 2 -t 2 -g 1 -n 1 -w 0 -c 0
+elif [ "$RUN_TYPE" == "NSYS" ];then
+  cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER"
+elif [ "$RUN_TYPE" == "NCU" ];then
+  # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER"
+  cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER"
+fi
+
+echo cmd=$cmd
+$cmd #> /home/panlichen/work2/ofccl/log/ofccl.log

diff --git a/src/Makefile b/src/Makefile
index 2a399db..5927cc2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda
 PREFIX ?= /usr/local
 VERBOSE ?= 0
-DEBUG ?= 0
+DEBUG_NT ?= 0
 
 CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
@@ -19,27 +19,39 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
-NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-                -gencode=arch=compute_80,code=sm_80 \
-                -gencode=arch=compute_80,code=compute_80
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+#                 -gencode=arch=compute_61,code=sm_61 \
+#                 -gencode=arch=compute_70,code=sm_70 \
+#                 -gencode=arch=compute_80,code=sm_80 \
+#                 -gencode=arch=compute_80,code=compute_80
+# else
+# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+#                 -gencode=arch=compute_50,code=sm_50 \
+#                 -gencode=arch=compute_60,code=sm_60 \
+#                 -gencode=arch=compute_61,code=sm_61 \
+#                 -gencode=arch=compute_70,code=sm_70 \
+#                 -gencode=arch=compute_70,code=compute_70
+# endif
+
+CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86
+CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75
+
+CARDNAME ?= 3080
+ifeq ($(CARDNAME), 3080)
+NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE)
 else
-NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-                -gencode=arch=compute_70,code=compute_70
+NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE)
 endif
+$(info CARDNAME $(CARDNAME))
+$(info NVCC_GENCODE $(NVCC_GENCODE))
 
 NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
 
 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt
 NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
 
-ifeq ($(DEBUG), 0)
+ifeq ($(DEBUG_NT), 0)
 NVCUFLAGS += -O3 -g
 CXXFLAGS += -O3 -g
 else
@@ -72,6 +84,8 @@ endif
 LIBRARIES += nccl
 NVLDFLAGS += $(LIBRARIES:%=-l%)
 
+$(info NVCUFLAGS $(NVCUFLAGS))
+
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
diff --git a/src/common.cu b/src/common.cu
index 05f814d..fea29f0 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -590,7 +590,7 @@ testResult_t completeColl(struct threadArgs* args) {
 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root,
     int in_place) {
   size_t count = args->nbytes / wordSize(type);
-  if (datacheck) {
+  if (datacheck) { // The intent here seems to be that colls run only for bandwidth measurement also use non-zero data.
     // Initialize sendbuffs, recvbuffs and expected
     TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
   }
@@ -652,6 +652,9 @@
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
   deltaSec = deltaSec/(iters*agg_iters);
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
+  // int cudaDev;
+  // cudaGetDevice(&cudaDev);
+  // OFTEST_LOG(TEST, "Rank<%d>, time = %lfus", cudaDev, deltaSec * 1.0E6);
   Allreduce(args, &deltaSec, average);
 
 #if CUDART_VERSION >= 11030
@@ -732,11 +735,13 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   } else {
     sprintf(timeStr, "%7.2f", timeUsec);
   }
-  if (datacheck) {
-    PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
-  } else {
-    PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
-  }
+  #ifndef NCCL_DEBUG_CLOCK
+    if (datacheck) {
+      PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
+    } else {
+      PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
+    }
+  #endif
 
   args->bw[0] += busBw;
   args->bw_count[0]++;
@@ -775,9 +780,12 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   // Benchmark
   for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
       setupArgs(size, type, args);
-      print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+
+      #ifndef NCCL_DEBUG_CLOCK
+        print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+      #endif
       TESTCHECK(BenchTime(args, type, op, root, 0));
-      TESTCHECK(BenchTime(args, type, op, root, 1));
+      // TESTCHECK(BenchTime(args, type, op, root, 1));
       PRINT("\n");
   }
   return testSuccess;
@@ -1027,13 +1035,16 @@ testResult_t run() {
 #endif
   is_main_thread = (proc == 0) ? 1 : 0;
 
-  PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
-      (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
-  if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
-  if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
-  PRINT("#\n");
+  #ifndef NCCL_DEBUG_CLOCK
+    PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
+        (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
+    if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
+    if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
+    PRINT("#\n");
+
+    PRINT("# Using devices\n");
+  #endif
 
-  PRINT("# Using devices\n");
 #define MAX_LINE 2048
   char line[MAX_LINE];
   int len = 0;
@@ -1048,20 +1059,21 @@ testResult_t run() {
     maxMem = std::min(maxMem, prop.totalGlobalMem);
   }
 
-#if MPI_SUPPORT
-  char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL;
-  // Gather all output in rank order to root (0)
-  MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
-  if (proc == 0) {
-    for (int p = 0; p < nProcs; p++)
-      PRINT("%s", lines+MAX_LINE*p);
-    free(lines);
-  }
-  MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
-#else
-  PRINT("%s", line);
+#ifndef NCCL_DEBUG_CLOCK
+  #if MPI_SUPPORT
+    char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL;
+    // Gather all output in rank order to root (0)
+    MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
+    if (proc == 0) {
+      for (int p = 0; p < nProcs; p++)
+        PRINT("%s", lines+MAX_LINE*p);
+      free(lines);
+    }
+    MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
+  #else
+    PRINT("%s", line);
+  #endif
 #endif
-
   // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
   size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
   if (maxBytes > memMaxBytes) {
@@ -1118,8 +1130,10 @@ testResult_t run() {
     errors[t] = bw_count[t] = 0;
   }
 
-  PRINT("#\n");
-  print_header();
+  #ifndef NCCL_DEBUG_CLOCK
+    PRINT("#\n");
+    print_header();
+  #endif
 
   int* sync = (int*)calloc(2, sizeof(int));
   int* barrier = (int*)calloc(2, sizeof(int));
@@ -1199,9 +1213,14 @@
   double check_avg_bw = str ? atof(str) : -1;
   bw[0] /= bw_count[0];
 
-  PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
-  PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
-  PRINT("#\n");
+  #ifndef NCCL_DEBUG_CLOCK
+    PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
+    PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
+    PRINT("#\n");
+  #else
+    PRINT("\n");
+    PRINT("\n");
+  #endif
 #ifdef MPI_SUPPORT
   MPI_Finalize();
 #endif
diff --git a/src/common.h b/src/common.h
index bd84d01..a6703b2 100644
--- a/src/common.h
+++ b/src/common.h
@@ -16,6 +16,10 @@
 #include <pthread.h>
 #include "nccl1_compat.h"
 
+#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args)
+
+// #define NCCL_DEBUG_CLOCK 1
+
 #define CUDACHECK(cmd) do {                         \
   cudaError_t err = cmd;                            \
   if( err != cudaSuccess ) {                        \
diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h
index 020a4bc..32f04e6 100644
--- a/src/nccl1_compat.h
+++ b/src/nccl1_compat.h
@@ -3,7 +3,7 @@
  *
  * See LICENSE.txt for license information
  ************************************************************************/
-
+#include <stdio.h>
 #ifndef NCCL1_COMPAT_H
 #define NCCL1_COMPAT_H
@@ -14,8 +14,8 @@
 #define ncclNumOps nccl_NUM_OPS
 #define ncclNumTypes nccl_NUM_TYPES
 
-static ncclResult_t ncclGroupStart() { return ncclSuccess; }
-static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
+static ncclResult_t ncclGroupStart() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; }
+static ncclResult_t ncclGroupEnd() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; }
 
 #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
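The OFTEST_LOG macro added to common.h above prefixes every message with file, line, and function context; PRE is a bare tag that gets stringized, and the remaining arguments feed printf. A minimal usage sketch (the rank value and the standalone main() are illustrative, not part of this patch):

#include <cstdio>

#define OFTEST_LOG(PRE, FMT, args...) \
  printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", \
         __FILE__, __LINE__, __func__, args)

int main() {
  int cudaDev = 0; // illustrative value
  // Prints e.g.: (testlog) [demo.cc:10] <main> TEST Rank<0>, multi_iters = 4
  OFTEST_LOG(TEST, "Rank<%d>, multi_iters = %d", cudaDev, 4);
  return 0;
}

Note the GNU named-variadic form (args...) requires at least one trailing argument, which matches how the test sources call it (a separate OFTEST_LOG1 is used for the zero-argument case).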
diff --git a/src_inplace/Makefile b/src_inplace/Makefile
new file mode 100644
index 0000000..840c997
--- /dev/null
+++ b/src_inplace/Makefile
@@ -0,0 +1,109 @@
+#
+# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG_NT ?= 0
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+NVCC = $(CUDA_HOME)/bin/nvcc
+CUDARTLIB ?= cudart
+
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+
+# Better define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+#                 -gencode=arch=compute_61,code=sm_61 \
+#                 -gencode=arch=compute_70,code=sm_70 \
+#                 -gencode=arch=compute_80,code=sm_80 \
+#                 -gencode=arch=compute_80,code=compute_80
+# else
+# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+#                 -gencode=arch=compute_50,code=sm_50 \
+#                 -gencode=arch=compute_60,code=sm_60 \
+#                 -gencode=arch=compute_61,code=sm_61 \
+#                 -gencode=arch=compute_70,code=sm_70 \
+#                 -gencode=arch=compute_70,code=compute_70
+# endif
+
+CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86
+CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75
+
+CARDNAME ?= 3080
+ifeq ($(CARDNAME), 3080)
+NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE)
+else
+NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE)
+endif
+$(info CARDNAME $(CARDNAME))
+$(info NVCC_GENCODE $(NVCC_GENCODE))
+
+NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
+
+LDFLAGS := -L${CUDA_LIB} -lcudart -lrt
+NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
+
+ifeq ($(DEBUG_NT), 0)
+NVCUFLAGS += -O3 -g
+CXXFLAGS += -O3 -g
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS += -O0 -g -ggdb3
+endif
+
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
+else
+.SILENT:
+endif
+
+.PHONY: build clean
+
+BUILDDIR ?= ../build
+ifneq ($(NCCL_HOME), "")
+NVCUFLAGS += -I$(NCCL_HOME)/include/
+NVLDFLAGS += -L$(NCCL_HOME)/lib
+endif
+
+ifeq ($(MPI), 1)
+NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include
+NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi
+endif
+ifeq ($(MPI_IBM),1)
+NVCUFLAGS += -DMPI_SUPPORT
+NVLDFLAGS += -lmpi_ibm
+endif
+LIBRARIES += nccl
+NVLDFLAGS += $(LIBRARIES:%=-l%)
+
+$(info NVCUFLAGS $(NVCUFLAGS))
+
+DST_DIR := $(BUILDDIR)
+SRC_FILES := $(wildcard *.cu)
+OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
+BIN_FILES_LIST := ofccl_all_reduce_inp
+BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
+
+build: ${BIN_FILES}
+
+clean:
+	rm -rf ${DST_DIR}
+
+${DST_DIR}/%.o: %.cu common_inplace.h
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
+
+${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_inplace.o
+	@printf "Linking  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
+
diff --git a/src_inplace/common_inplace.cu b/src_inplace/common_inplace.cu
new file mode 100644
index 0000000..22cfecb
--- /dev/null
+++ b/src_inplace/common_inplace.cu
@@ -0,0 +1,1477 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common_inplace.h"
+#include "cuda.h"
+#include "nccl.h"
+#include <pthread.h>
+#include <cstdio>
+#include <getopt.h>
+#include <libgen.h>
+#include <chrono>
+
+int test_ncclVersion = 0; // init'd with ncclGetVersion()
+
+#if NCCL_MAJOR >= 2
+ncclDataType_t test_types[ncclNumTypes] = {ncclInt8,
+                                           ncclUint8,
+                                           ncclInt32,
+                                           ncclUint32,
+                                           ncclInt64,
+                                           ncclUint64,
+                                           ncclHalf,
+                                           ncclFloat,
+                                           ncclDouble
+#if defined(__CUDA_BF16_TYPES_EXIST__) && \
+    NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+                                           ,
+                                           ncclBfloat16
+#endif
+};
+const char *test_typenames[ncclNumTypes] = {"int8",
+                                            "uint8",
+                                            "int32",
+                                            "uint32",
+                                            "int64",
+                                            "uint64",
+                                            "half",
+                                            "float",
+                                            "double"
+#if defined(__CUDA_BF16_TYPES_EXIST__) && \
+    NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+                                            ,
+                                            "bfloat16"
+#endif
+};
+int test_typenum = -1;
+
+const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"};
+ncclRedOp_t test_ops[] = {
+    ncclSum,
+    ncclProd,
+    ncclMax,
+    ncclMin
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+    ,
+    ncclAvg
+#endif
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    ,
+    ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand
+#endif
+};
+int test_opnum = -1;
+#else
+ncclDataType_t test_types[ncclNumTypes] = {
+    ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
+const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float",
+                                            "double", "int64", "uint64"};
+int test_typenum = 7;
+const char *test_opnames[] = {"sum", "prod", "max", "min"};
+ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin};
+int test_opnum = 4;
+#endif
+
+thread_local int is_main_thread = 0;
+
+// Command line parameter defaults
+static int nThreads = 1;
+static int nGpus = 1;
+static size_t minBytes = 32 * 1024 * 1024;
+static size_t maxBytes = 32 * 1024 * 1024;
+static size_t stepBytes = 1 * 1024 * 1024;
+static size_t stepFactor = 1;
+static int datacheck = 1;
+static int warmup_iters = 5;
+static int iters = 20;
+static int agg_iters = 1;
+static int multi_iters = 1;
+static int ncclop = ncclSum;
+static int nccltype = ncclFloat;
+static int ncclroot = 0;
+static int parallel_init = 0;
+static int blocking_coll = 0;
+static int cudaGraphLaunches = 0;
+// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
+static int average = 1;
+
+#define NUM_BLOCKS 32
+
+static thread_local CallBackArgs cbArgList[MAX_COLL_NUM];
+static thread_local int seenCqe[MAX_COLL_NUM];
+
+static double parsesize(const char *value) {
+  long long int units;
+  double size;
+  char size_lit;
+
+  int count = sscanf(value, "%lf %1s", &size, &size_lit);
+
+  switch (count) {
+  case 2:
+    switch (size_lit) {
+    case 'G':
+    case 'g':
+      units = 1024 * 1024 * 1024;
+      break;
+    case 'M':
+    case 'm':
+      units = 1024 * 1024;
+      break;
+    case 'K':
+    case 'k':
+      units = 1024;
+      break;
+    default:
+      return -1.0;
+    };
+    break;
+  case 1:
+    units = 1;
+    break;
+  default:
+    return -1.0;
+  }
+
+  return size * units;
+}
+
+double DeltaMaxValue(ncclDataType_t type) {
+  switch (type) {
+  case ncclHalf:
+    return 1e-2;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16:
+    return 1e-2;
+#endif
+  case ncclFloat:
+    return 1e-5;
+  case ncclDouble:
+    return 1e-12;
+  case ncclInt:
+#if NCCL_MAJOR >= 2
+  case ncclUint8:
+    // case ncclInt32:
+  case ncclUint32:
+#endif
+  case ncclInt64:
+  case ncclUint64:
+    return 1e-200;
+  }
+  return 1e-200;
+}
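DeltaMaxValue() above gives the per-type, per-rank tolerance; CheckData() further down in this file scales it by (nranks - 1) before flagging an error. A small worked example of that threshold, with illustrative values:

#include <cstdio>

int main() {
  double perRankTol = 1e-5;  // DeltaMaxValue(ncclFloat)
  int nranks = 8;            // illustrative world size
  // CheckData() flags an error when maxDelta > perRankTol * (nranks - 1).
  double threshold = perRankTol * (nranks - 1);
  printf("float allreduce across %d ranks tolerates maxDelta up to %g\n",
         nranks, threshold);  // 7e-05
  return 0;
}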
a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue(offset, rep, rank)); +} +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template __device__ T ncclOpSum(T a, T b) { return a + b; } +template __device__ T ncclOpProd(T a, T b) { return a * b; } +template __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template __device__ T ncclOpMin(T a, T b) { return a < b ? 
+// Operations
+template <typename T> __device__ T ncclOpSum(T a, T b) { return a + b; }
+template <typename T> __device__ T ncclOpProd(T a, T b) { return a * b; }
+template <typename T> __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; }
+template <typename T> __device__ T ncclOpMin(T a, T b) { return a < b ? a : b; }
+
+// Definitions for half
+template <> __device__ half ncclOpSum(half a, half b) {
+  return __float2half(__half2float(a) + __half2float(b));
+}
+template <> __device__ half ncclOpProd(half a, half b) {
+  return __float2half(__half2float(a) * __half2float(b));
+}
+template <> __device__ half ncclOpMax(half a, half b) {
+  return __half2float(a) > __half2float(b) ? a : b;
+}
+template <> __device__ half ncclOpMin(half a, half b) {
+  return __half2float(a) < __half2float(b) ? a : b;
+}
+
+template <typename T> __device__ T ncclPPOpIdent(T x, int arg) { return x; }
+template <typename T> __device__ T ncclPPOpMul(T x, int arg) {
+  return x * T(arg);
+}
+template <typename T> __device__ T ncclPPOpDiv(T x, int arg) {
+  return x / T(arg);
+}
+template <> __device__ half ncclPPOpMul(half x, int arg) {
+  return __float2half(__half2float(x) * float(arg));
+}
+template <> __device__ half ncclPPOpDiv(half x, int n) {
+  return __float2half(__half2float(x) / n);
+}
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) {
+  return __float2bfloat16(__bfloat162float(x) * float(arg));
+}
+template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) {
+  return __float2bfloat16(__bfloat162float(x) / n);
+}
+#endif
+
+__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; }
+
+template <typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)>
+__global__ void InitDataReduceKernel(T *data, const size_t N,
+                                     const size_t offset, const int rep,
+                                     const int nranks) {
+  for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N;
+       o += gridDim.x * blockDim.x) {
+    T val = testValue<T>(o + offset, rep, 0);
+    val = PreOp(val, preMulScalar(0));
+    for (int i = 1; i < nranks; i++) {
+      T val1 = testValue<T>(o + offset, rep, i);
+      val1 = PreOp(val1, preMulScalar(i));
+      val = Op(val, val1);
+    }
+    data[o] = PostOp(val, nranks);
+  }
+}
+
+#define KERN(type, op, preop, postop)                                         \
+  (void *)InitDataReduceKernel<type, op<type>, preop<type>, postop<type>>
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+#define OPS(type)                                                             \
+  KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent),                        \
+      KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent),                   \
+      KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv),              \
+      KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent)
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+#define OPS(type)                                                             \
+  KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent),                        \
+      KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent),                   \
+      KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv)
+#else
+#define OPS(type)                                                             \
+  KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent),                        \
+      KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent),                   \
+      KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent)
+#endif
+
+static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = {
+    OPS(int8_t),   OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t),
+    OPS(uint64_t), OPS(half),    OPS(float),   OPS(double),
+#if defined(__CUDA_BF16_TYPES_EXIST__) && \
+    NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+    OPS(__nv_bfloat16)
+#endif
+};
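The redInitDataKerns table above holds one kernel pointer per (type, op) pair, laid out type-major, so InitDataReduce below indexes it as [type * test_opNumMax + op]. A sketch of that lookup, assuming test_opNumMax (declared in common_inplace.h, not shown in this diff) matches the six entries of the 2.11+ OPS() list:

#include <cstdio>

int main() {
  const int test_opNumMax = 6;  // assumed: sum/prod/max/min/avg/premulsum
  int type = 7;                 // ncclFloat in test_types[] ordering
  int op = 0;                   // ncclSum
  // Index of the float/sum InitDataReduceKernel instantiation.
  printf("kernel index = %d\n", type * test_opNumMax + op);  // 42
  return 0;
}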
+testResult_t InitDataReduce(void *data, const size_t count, const size_t offset,
+                            ncclDataType_t type, ncclRedOp_t op, const int rep,
+                            const int nranks) {
+  dim3 grid = {32, 1, 1};
+  dim3 block = {256, 1, 1};
+  void *args[5] = {(void *)&data, (void *)&count, (void *)&offset,
+                   (void *)&rep, (void *)&nranks};
+  CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid,
+                             block, args, 0, cudaStreamDefault));
+  return testSuccess;
+}
+
+template <typename T>
+__global__ void InitDataKernel(T *data, const size_t N, const int rep,
+                               const int rank) {
+  for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N;
+       o += gridDim.x * blockDim.x)
+    data[o] = testValue<T>(o, rep, rank);
+}
+
+static void *const initDataKerns[ncclNumTypes] = {
+    (void *)InitDataKernel<int8_t>,  (void *)InitDataKernel<uint8_t>,
+    (void *)InitDataKernel<int32_t>, (void *)InitDataKernel<uint32_t>,
+    (void *)InitDataKernel<int64_t>, (void *)InitDataKernel<uint64_t>,
+    (void *)InitDataKernel<half>,    (void *)InitDataKernel<float>,
+    (void *)InitDataKernel<double>,
+#if defined(__CUDA_BF16_TYPES_EXIST__) && \
+    NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+    (void *)InitDataKernel<__nv_bfloat16>
+#endif
+};
+
+template <typename T>
+testResult_t InitDataType(void *dest, const size_t N, const int rep,
+                          const int rank) {
+  T *ptr = (T *)dest;
+  InitDataKernel<<<16, 512>>>(ptr, N, rep, rank);
+  return testSuccess;
+}
+
+testResult_t InitData(void *data, const size_t count, ncclDataType_t type,
+                      const int rep, const int rank) {
+  dim3 grid = {32, 1, 1};
+  dim3 block = {256, 1, 1};
+  void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank};
+  CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault));
+  return testSuccess;
+}
+
+void Barrier(struct threadArgs *args) {
+  while (args->barrier[args->barrier_idx] != args->thread)
+    pthread_yield();
+  args->barrier[args->barrier_idx] = args->thread + 1;
+  if (args->thread + 1 == args->nThreads) {
+#ifdef MPI_SUPPORT
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+    args->barrier[args->barrier_idx] = 0;
+  } else {
+    while (args->barrier[args->barrier_idx])
+      pthread_yield();
+  }
+  args->barrier_idx = !args->barrier_idx;
+}
+
+// Inter-thread/process barrier+allreduce
+void Allreduce(struct threadArgs *args, double *value, int average) {
+  while (args->barrier[args->barrier_idx] != args->thread)
+    pthread_yield();
+  double val = *value;
+  if (args->thread > 0) {
+    double val2 = args->reduce[args->barrier_idx];
+    if (average == 1)
+      val += val2;
+    if (average == 2)
+      val = std::min(val, val2);
+    if (average == 3)
+      val = std::max(val, val2);
+  }
+  if (average || args->thread == 0)
+    args->reduce[args->barrier_idx] = val;
+  args->barrier[args->barrier_idx] = args->thread + 1;
+  if (args->thread + 1 == args->nThreads) {
+#ifdef MPI_SUPPORT
+    if (average != 0) {
+      MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX;
+      MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1,
+                    MPI_DOUBLE, op, MPI_COMM_WORLD);
+    }
+#endif
+    if (average == 1)
+      args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads;
+    args->reduce[1 - args->barrier_idx] = 0;
+    args->barrier[args->barrier_idx] = 0;
+  } else {
+    while (args->barrier[args->barrier_idx])
+      pthread_yield();
+  }
+  *value = args->reduce[args->barrier_idx];
+  args->barrier_idx = !args->barrier_idx;
+}
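Barrier() and Allreduce() above share a double-buffered arrival idiom: threads bump a slot in turn, the last arrival resets it to release everyone, and barrier_idx flips so back-to-back barriers do not collide. A minimal standalone sketch of the same idiom (toyBarrier and slot names are illustrative; sched_yield stands in for the glibc pthread_yield used in the source):

#include <sched.h>

volatile int slots[2] = {0, 0};

void toyBarrier(int thread, int nThreads, int *idx) {
  volatile int *slot = &slots[*idx];
  while (*slot != thread) sched_yield();  // wait for my turn to arrive
  *slot = thread + 1;                     // announce arrival
  if (thread + 1 == nThreads) *slot = 0;  // last arrival releases everyone
  else { while (*slot) sched_yield(); }   // wait for the release
  *idx = !*idx;                           // alternate slots between calls
}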
+testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
+  size_t count = args->expectedBytes/wordSize(type);
+  double maxDelta = 0.0;
+  for (int i=0; i<args->nGpus; i++) {
+    int device;
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
+    CUDACHECK(cudaSetDevice(device));
+    void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
+    TESTCHECK(CheckDelta(data, args->expected[i], count, type, args->deltaHost));
+    maxDelta = std::max(*(args->deltaHost), maxDelta);
+
+#ifdef DEBUG_PRINT
+    if (rank == 0) {
+      int *expectedHost = (int *)malloc(args->expectedBytes);
+      int *dataHost = (int *)malloc(args->expectedBytes);
+
+      cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost);
+      printf("\n Expected: ");
+      for (int j=0; j<args->expectedBytes/sizeof(int); j++) {
+        printf("%d:%d ", j, expectedHost[j]);
+      }
+      printf("\n");
+
+      cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
+      printf("\n Actual: ");
+      for (int j=0; j<args->expectedBytes/sizeof(int); j++) {
+        printf("%d:%d ", j, dataHost[j]);
+      }
+      printf("\n");
+      free(expectedHost);
+      free(dataHost);
+    }
+#endif
+  }
+  double nranks = args->nProcs*args->nThreads*args->nGpus;
+  if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
+  *delta = maxDelta;
+  return testSuccess;
+}
+
+testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams,
+                                   ncclComm_t *comms) {
+  cudaError_t cudaErr;
+  int remaining = ngpus;
+  int *done = (int *)malloc(sizeof(int) * ngpus);
+  memset(done, 0, sizeof(int) * ngpus);
+  while (remaining) {
+    int idle = 1;
+    for (int i = 0; i < ngpus; i++) {
+      if (done[i])
+        continue;
+
+      cudaErr = cudaStreamQuery(streams[i]);
+      if (cudaErr == cudaSuccess) {
+        done[i] = 1;
+        remaining--;
+        idle = 0;
+        continue;
+      }
+
+      if (cudaErr != cudaErrorNotReady)
+        CUDACHECK(cudaErr);
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0)
+      if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) {
+        ncclResult_t ncclAsyncErr;
+        NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
+        if (ncclAsyncErr != ncclSuccess) {
+          // An asynchronous error happened. Stop the operation and destroy
+          // the communicator
+          for (int i = 0; i < ngpus; i++)
+            NCCLCHECK(ncclCommAbort(comms[i]));
+          // Abort the perf test
+          NCCLCHECK(ncclAsyncErr);
+        }
+      }
+#endif
+    }
+
+    // We might want to let other threads (including NCCL threads) use the CPU.
+    if (idle)
+      pthread_yield();
+  }
+  free(done);
+  return testSuccess;
+}
+testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type,
+                         ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) {
+  size_t count = args->nbytes / wordSize(type);
+  if (args->nGpus != 1) {
+    OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs");
+    return testInternalError;
+  }
+  // Try to change offset for each iteration so that we avoid cache effects and
+  // catch race conditions in ptrExchange
+  // size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
+  // size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+  // size_t shift = totalnbytes * (iter % steps);
+
+  for (int i = 0; i < args->nGpus; i++) {
+    ncclComm_t comm = args->comms[miter * nGpus + i];
+    int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i);
+    ncclRedOp_t op;
+
+    if (opIndex < ncclNumOps) {
+      op = opIndex;
+    }
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    else {
+      union {
+        int8_t i8;
+        uint8_t u8;
+        int32_t i32;
+        uint32_t u32;
+        int64_t i64;
+        uint64_t u64;
+        half f16;
+        float f32;
+        double f64;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+        __nv_bfloat16 bf16;
+#endif
+      };
+      int scalar = preMulScalar(rank);
+      switch (type) {
+      case ncclInt8:
+        i8 = int8_t(scalar);
+        break;
+      case ncclUint8:
+        u8 = uint8_t(scalar);
+        break;
+      case ncclInt32:
+        i32 = int32_t(scalar);
+        break;
+      case ncclUint32:
+        u32 = uint32_t(scalar);
+        break;
+      case ncclInt64:
+        i64 = int64_t(scalar);
+        break;
+      case ncclUint64:
+        u64 = uint64_t(scalar);
+        break;
+      case ncclFloat16:
+        f16 = __float2half(float(scalar));
+        break;
+      case ncclFloat32:
+        f32 = float(scalar);
+        break;
+      case ncclFloat64:
+        f64 = double(scalar);
+        break;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+      case ncclBfloat16:
+        bf16 = __float2bfloat16(float(scalar));
+        break;
+#endif
+      }
+      NCCLCHECK(ncclRedOpCreatePreMulSum(
+          &op, &u64, type, ncclScalarHostImmediate, comm));
+    }
+#endif
+    TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx));
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    if (opIndex >= ncclNumOps) {
+      NCCLCHECK(ncclRedOpDestroy(op, comm));
+    }
+#endif
+  }
+
+  return testSuccess;
+}
+testResult_t startColl(struct threadArgs *args, ncclDataType_t type,
+                       ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) {
+  size_t count = args->nbytes / wordSize(type);
+
+  // Try to change offset for each iteration so that we avoid cache effects and
+  // catch race conditions in ptrExchange
+  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
+  size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+  size_t shift = totalnbytes * (iter % steps);
+
+  if (args->nGpus > 1) {
+    // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart");
+    NCCLCHECK(ncclGroupStart());
+  }
+  for (int i = 0; i < args->nGpus; i++) {
+    ncclComm_t comm = args->comms[miter * nGpus + i];
+    // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm);
+#ifndef NCCL_MAJOR
+    int cudaDev;
+    NCCLCHECK(ncclCommCuDevice(comm, &cudaDev));
+    CUDACHECK(cudaSetDevice(cudaDev));
+#endif
+    int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i);
+    char *recvBuff = ((char *)args->recvbuffs[i]) + shift;
+    char *sendBuff = ((char *)args->sendbuffs[i]) + shift;
+    ncclRedOp_t op;
+
+    if (opIndex < ncclNumOps) {
+      op = opIndex;
+    }
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    else {
+      union {
+        int8_t i8;
+        uint8_t u8;
+        int32_t i32;
+        uint32_t u32;
+        int64_t i64;
+        uint64_t u64;
+        half f16;
+        float f32;
+        double f64;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+        __nv_bfloat16 bf16;
+#endif
+      };
+      int scalar = preMulScalar(rank);
+      switch (type) {
+      case ncclInt8:
+        i8 = int8_t(scalar);
+        break;
+      case ncclUint8:
+        u8 = uint8_t(scalar);
+        break;
+      case ncclInt32:
+        i32 = int32_t(scalar);
+        break;
+      case ncclUint32:
+        u32 = uint32_t(scalar);
+        break;
+      case ncclInt64:
+        i64 = int64_t(scalar);
+        break;
+      case ncclUint64:
+        u64 = uint64_t(scalar);
+        break;
+      case ncclFloat16:
+        f16 = __float2half(float(scalar));
+        break;
+      case ncclFloat32:
+        f32 = float(scalar);
+        break;
+      case ncclFloat64:
+        f64 = double(scalar);
+        break;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+      case ncclBfloat16:
+        bf16 = __float2bfloat16(float(scalar));
+        break;
+#endif
+      }
+      NCCLCHECK(ncclRedOpCreatePreMulSum(
+          &op, &u64, type, ncclScalarHostImmediate, comm));
+    }
+#endif
+    // miter is the collId.
+    TESTCHECK(args->collTest->runColl(
+        (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank
+                          : sendBuff),
+        (void *)(in_place ? recvBuff + args->recvInplaceOffset * rank
+                          : recvBuff), miter, cbArgList + miter, rankCtx));
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    if (opIndex >= ncclNumOps) {
+      NCCLCHECK(ncclRedOpDestroy(op, comm));
+    }
+#endif
+  }
+  if (args->nGpus > 1) {
+    // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd");
+    NCCLCHECK(ncclGroupEnd());
+  }
+
+  if (blocking_coll) {
+    // Complete op before returning
+    TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
+  }
+  if (blocking_coll)
+    Barrier(args);
+  return testSuccess;
+}
+
+testResult_t completeColl(struct threadArgs *args) {
+  if (blocking_coll)
+    return testSuccess;
+
+  int gotCqeCnt = 0;
+  while (gotCqeCnt < multi_iters) {
+    for (int i = 0; i < multi_iters; i++) {
+      pthread_mutex_lock(&cbArgList[i].mutex);
+      if (cbArgList[i].gotCqe == 1) {
+        if (seenCqe[i] == 0) {
+          gotCqeCnt++;
+          seenCqe[i] = 1;
+
+          // int cudaDev;
+          // CUDACHECK(cudaGetDevice(&cudaDev));
+          // if (cudaDev == 0) {
+          //   OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i);
+          // }
+
+        }
+      }
+      pthread_mutex_unlock(&cbArgList[i].mutex);
+    }
+  }
+  return testSuccess;
+}
+
+testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) {
+
+  size_t count = args->nbytes / wordSize(type);
+
+  Barrier(args);
+
+  // Performance Benchmark
+  auto start = std::chrono::high_resolution_clock::now();
+  for (int iter = 0; iter < iters; iter++) {
+
+    for (int miter = 0; miter < multi_iters; miter++) {
+      seenCqe[miter] = 0;
+      TESTCHECK(startColl(args, type, op, root, in_place,
+                          iter * multi_iters + miter, miter, rankCtx));
+    }
+
+    TESTCHECK(completeColl(args));
+
+    int cudaDev;
+    cudaGetDevice(&cudaDev);
+    OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
+  }
+
+  auto delta = std::chrono::high_resolution_clock::now() - start;
+  double deltaSec =
+      std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+  deltaSec = deltaSec / (iters * agg_iters * multi_iters);
+  if (cudaGraphLaunches >= 1)
+    deltaSec = deltaSec / cudaGraphLaunches;
+  Allreduce(args, &deltaSec, average);
+
+  double algBw, busBw;
+  args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw,
+                        args->nProcs * args->nThreads * args->nGpus);
+
+  Barrier(args);
+
+  ofcclDestroy(rankCtx);
+
+  double maxDelta = 0;
+  // static __thread int rep = 0; // parameter for re-initializing the buffers; no longer used.
+  // rep++;
+  if (datacheck) {
+
+    TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
+    // aggregate delta from all threads and procs
+    Allreduce(args, &maxDelta, 3);
+  }
+
+  double timeUsec = deltaSec * 1.0E6;
+  char timeStr[100];
+  if (timeUsec >= 10000.0) {
+    sprintf(timeStr, "%7.0f", timeUsec);
+  } else if (timeUsec >= 100.0) {
+    sprintf(timeStr, "%7.1f", timeUsec);
+  } else {
+    sprintf(timeStr, "%7.2f", timeUsec);
+  }
+  if (datacheck) {
+    PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
+  } else {
+    PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
+  }
+
+  args->bw[0] += busBw;
+  args->bw_count[0]++;
+  return testSuccess;
+}
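completeColl() above polls cbArgList[i].gotCqe under a mutex until every one of the multi_iters collectives has reported a completion-queue entry. The callback that sets that flag is registered by the per-test runColl() and is not part of this diff; a plausible sketch of the producer side, with the CallBackArgs fields inferred from how completeColl() reads them (the real struct lives in common_inplace.h):

#include <pthread.h>

// Hypothetical shape of CallBackArgs, inferred from completeColl().
typedef struct {
  int collId;
  int gotCqe;
  pthread_mutex_t mutex;
} CallBackArgs;

// Hypothetical completion callback invoked when a CQE arrives for collId.
static int commonCallback(int collId, void *arg) {
  CallBackArgs *cb = (CallBackArgs *)arg;
  pthread_mutex_lock(&cb->mutex);
  cb->gotCqe = 1;  // completeColl() observes this flag and counts it once
  pthread_mutex_unlock(&cb->mutex);
  return 0;
}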
+void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) {
+  int nranks = args->nProcs * args->nGpus * args->nThreads;
+  size_t count, sendCount, recvCount, paramCount, sendInplaceOffset,
+      recvInplaceOffset;
+
+  count = size / wordSize(type);
+  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount,
+                                   &sendInplaceOffset, &recvInplaceOffset,
+                                   (size_t)count, (size_t)nranks);
+
+  args->nbytes = paramCount * wordSize(type);
+  args->sendBytes = sendCount * wordSize(type);
+  args->expectedBytes = recvCount * wordSize(type);
+  args->sendInplaceOffset = sendInplaceOffset * wordSize(type);
+  args->recvInplaceOffset = recvInplaceOffset * wordSize(type);
+}
+
+testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type,
+                      const char *typeName, ncclRedOp_t op, const char *opName,
+                      int root, bool is_ofccl) {
+  // if (is_ofccl) {
+  // First create the ofcclRankCtx_t.
+  int thrdCudaDev;
+  CUDACHECK(cudaGetDevice(&thrdCudaDev));
+  ofcclRankCtx_t rankCtx;
+  ofcclInitRankCtx(&rankCtx, thrdCudaDev);
+
+  // Prepare for all sizes; op and type are traversed in the caller.
+  // TODO: if we support multiple sizes, each size should use a separate ncclComm
+  for (size_t size = args->minbytes; size <= args->maxbytes;
+       size = ((args->stepfactor > 1) ? size * args->stepfactor
+                                      : size + args->stepbytes)) {
+    setupArgs(size, type, args);
+    for (int miter = 0; miter < multi_iters; miter++) {
+      TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx));
+    }
+  }
+
+  // Finish preparing the data for the correctness check here.
+  static __thread int rep = 0;
+  rep++;
+  if (datacheck) { // Run the data-init kernels before the daemonKernel is launched.
+    // Initialize sendbuffs, recvbuffs and expected
+    TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0));
+
+    // int cudaDev;
+    // CUDACHECK(cudaGetDevice(&cudaDev));
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), cudaDev);
+  }
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a little and saves the kernel-launch time. Works together with ofccl not quitting aggressively on its own.
+  // ofcclFinalizeRankCtx7StartHostThrds(rankCtx);
+  // }
+
+  // TODO: if we support multiple sizes, we could warm up for every size; or keep the current way, but then we must make sure the right comm is picked.
+  // Warmup still needs to be enabled, otherwise ofccl performance collapses.
+  setupArgs(args->maxbytes, type, args);
+  for (int iter = 0; iter < warmup_iters; iter++) {
+    for (int miter = 0; miter < multi_iters; miter++) {
+      seenCqe[miter] = 0;
+      TESTCHECK(startColl(args, type, op, root, 0,
+                          iter * multi_iters + miter, miter, rankCtx));
+    }
+    TESTCHECK(completeColl(args));
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), cudaDev, iter, multi_iters);
+  }
+
+  // Benchmark
+  // for (size_t size = args->minbytes; size <= args->maxbytes;
+  //      size = ((args->stepfactor > 1) ? size * args->stepfactor
+  //                                     : size + args->stepbytes)) {
+  //   setupArgs(size, type, args);
+  print_line_header(max(args->sendBytes, args->expectedBytes),
+                    args->nbytes / wordSize(type), typeName, opName, root);
+  // TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx));
+  TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // Because we moved ofcclDestroy into BenchTime, we cannot call BenchTime twice here to run out-of-place first and then in-place. To support that, add a loop inside BenchTime.
+  PRINT("\n");
+  // }
+
+  // if (is_ofccl) {
+  //   OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self());
+  //   ofcclDestroy(rankCtx); // Moved into BenchTime so that the check can run.
+  // }
+
+  return testSuccess;
+}
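TimeTest() above restructures the benchmark into prepare-once, run-many: all collectives are registered before the daemon kernel starts, and only issue/poll happens in the timed loop. A condensed sketch of that call order, using the helpers defined in this file (it compiles only inside this test source; loop structure is illustrative):

testResult_t ofcclFlowSketch(struct threadArgs *args, ncclDataType_t type,
                             ncclRedOp_t op, int root) {
  int dev;
  CUDACHECK(cudaGetDevice(&dev));

  ofcclRankCtx_t rankCtx;
  ofcclInitRankCtx(&rankCtx, dev);                   // 1. per-rank context

  for (int miter = 0; miter < multi_iters; miter++)  // 2. register every coll once
    TESTCHECK(prepareColl(args, type, op, root, 0, miter, miter, rankCtx));
  ofcclPrepareDone(rankCtx);                         // 3. launch the daemon kernel

  for (int iter = 0; iter < iters; iter++) {         // 4. issue + poll repeatedly
    for (int miter = 0; miter < multi_iters; miter++) {
      seenCqe[miter] = 0;
      TESTCHECK(startColl(args, type, op, root, 0, iter * multi_iters + miter,
                          miter, rankCtx));
    }
    TESTCHECK(completeColl(args));
  }

  ofcclDestroy(rankCtx);                             // 5. tear down (done in BenchTime)
  return testSuccess;
}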
+testResult_t threadRunTests(struct threadArgs *args) {
+  // OFTEST_LOG1(TEST, "Enter threadRunTests");
+  // Set device to the first of our GPUs. If we don't do that, some operations
+  // will be done on the current GPU (by default : 0) and if the GPUs are in
+  // exclusive mode those operations will fail.
+  int gpuid = args->localRank * args->nThreads * args->nGpus +
+              args->thread * args->nGpus;
+  CUDACHECK(cudaSetDevice(gpuid));
+  TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype,
+                                   test_typenames[nccltype],
+                                   (ncclRedOp_t)ncclop, test_opnames[ncclop]));
+  return testSuccess;
+}
+
+testResult_t threadInit(struct threadArgs *args) {
+  // OFTEST_LOG1(TEST, "Enter threadInit");
+  char hostname[1024];
+  getHostName(hostname, 1024);
+  int nranks = args->nProcs * args->nThreads * args->nGpus;
+
+  // set main thread again
+  is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0;
+
+  NCCLCHECK(ncclGroupStart());
+  for (int i = 0; i < args->nGpus; i++) {
+    int rank = args->proc * args->nThreads * args->nGpus +
+               args->thread * args->nGpus + i;
+    int gpuid = args->localRank * args->nThreads * args->nGpus +
+                args->thread * args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    // OFTEST_LOG1(TEST, "CommInitRank here");
+    NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank));
+  }
+  NCCLCHECK(ncclGroupEnd());
+
+  TESTCHECK(threadRunTests(args));
+
+  for (int i = 0; i < args->nGpus; i++) {
+    NCCLCHECK(ncclCommDestroy(args->comms[i]));
+  }
+  return testSuccess;
+}
+
+void *threadLauncher(void *thread_) {
+  struct testThread *thread = (struct testThread *)thread_;
+  thread->ret = thread->func(&thread->args);
+  return NULL;
+}
+testResult_t threadLaunch(struct testThread *thread) {
+  pthread_create(&thread->thread, NULL, threadLauncher, thread);
+  return testSuccess;
+}
+
+testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff,
+                           size_t recvBytes, void **expected, size_t nbytes,
+                           int nranks) {
+  CUDACHECK(cudaMalloc(sendbuff, nbytes));
+  // CUDACHECK(cudaMalloc(recvbuff, nbytes));
+  if (datacheck)
+    CUDACHECK(cudaMalloc(expected, recvBytes));
+  return testSuccess;
+}
+
+testResult_t run(); // Main function
+
+int main(int argc, char *argv[]) {
+  // Make sure every line is flushed so that we see the progress of the test
+  setlinebuf(stdout);
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0)
+  ncclGetVersion(&test_ncclVersion);
+#else
+  test_ncclVersion = NCCL_VERSION_CODE;
+#endif
+  // printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE,
+  //        test_ncclVersion);
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0)
+  test_opnum = 4;
+  test_typenum = 9;
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) &&
+      test_ncclVersion >= NCCL_VERSION(2, 10, 0)) {
+    test_opnum++; // ncclAvg
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    test_typenum++; // bfloat16
+#endif
+  }
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) &&
+      test_ncclVersion >= NCCL_VERSION(2, 11, 0)) {
+    test_opnum++; // PreMulSum
+  }
+#endif
+
+  // Parse args
+  double parsed;
+  int longindex;
+  static struct option longopts[] = {
+      {"nthreads", required_argument, 0, 't'},
+      {"ngpus", required_argument, 0, 'g'},
+      {"minbytes", required_argument, 0, 'b'},
+      {"maxbytes", required_argument, 0, 'e'},
+      {"stepbytes", required_argument, 0, 'i'},
+      {"stepfactor", required_argument, 0, 'f'},
+      {"iters", required_argument, 0, 'n'},
+      {"agg_iters", required_argument, 0, 'm'},
+      {"multi_iters", required_argument, 0, 'M'},
+      {"warmup_iters", required_argument, 0, 'w'},
+      {"parallel_init", required_argument, 0, 'p'},
+      {"check", required_argument, 0, 'c'},
+      {"op", required_argument, 0, 'o'},
+      {"datatype", required_argument, 0, 'd'},
+      {"root", required_argument, 0, 'r'},
+      {"blocking", required_argument, 0, 'z'},
+      {"cudagraph", required_argument, 0, 'G'},
+      {"average", required_argument, 0, 'a'},
+      {"help", no_argument, 0, 'h'},
+      {}};
+
+  while (1) {
+    int c;
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts,
+                    &longindex);
+
+    if (c == -1)
+      break;
+
+    switch (c) {
+    case 't':
+      nThreads = strtol(optarg, NULL, 0);
+      break;
+    case 'g':
+      nGpus = strtol(optarg, NULL, 0);
+      break;
+    case 'b':
+      parsed = parsesize(optarg);
+      if (parsed < 0) {
+        fprintf(stderr, "invalid size specified for 'minbytes'\n");
+        return -1;
+      }
+      minBytes = (size_t)parsed;
+      break;
+    case 'e':
+      parsed = parsesize(optarg);
+      if (parsed < 0) {
+        fprintf(stderr, "invalid size specified for 'maxbytes'\n");
+        return -1;
+      }
+      maxBytes = (size_t)parsed;
+      break;
+    case 'i':
+      stepBytes = strtol(optarg, NULL, 0);
+      break;
+    case 'f':
+      stepFactor = strtol(optarg, NULL, 0);
+      break;
+    case 'n':
+      iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'M':
+      multi_iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'm':
+#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
+      agg_iters = (int)strtol(optarg, NULL, 0);
+#else
+      fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n");
+#endif
+      break;
+    case 'w':
+      warmup_iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'c':
+      datacheck = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'p':
+      parallel_init = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'o':
+      ncclop = ncclstringtoop(optarg);
+      break;
+    case 'd':
+      nccltype = ncclstringtotype(optarg);
+      break;
+    case 'r':
+      ncclroot = strtol(optarg, NULL, 0);
+      break;
+    case 'z':
+      blocking_coll = strtol(optarg, NULL, 0);
+      break;
+    case 'G':
+#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \
+    CUDART_VERSION >= 11030
+      cudaGraphLaunches = strtol(optarg, NULL, 0);
+#else
+      printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA "
+             "11.3. Ignoring\n");
+#endif
+      break;
Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-M,--multi_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), + MPI_BYTE, MPI_COMM_WORLD); + for (int p = 0; p < nProcs; p++) { + if (p == proc) + break; + if (hostHashs[p] == hostHashs[proc]) + localRank++; + } +#endif + is_main_thread = (proc == 0) ? 1 : 0; + + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? "factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? 
(char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + // size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + // if (maxBytes > memMaxBytes) { + // maxBytes = memMaxBytes; + // if (proc == 0) + // printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + // maxBytes); + // } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads]; + void *recvbuffs[nGpus * nThreads]; + void *expected[nGpus * nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + (size_t)nProcs * nGpus * nThreads); + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + expected + i, (size_t)maxBytes, + nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when using multiple sizes. + ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use a separate comm per miter + // TODO: we do not support MPI for now.
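+ // Note on the layout: ncclCommInitAll below fills comms as [miter][thread][gpu]; + // the memcpy then regroups them into adjusted_comms as [thread][miter][gpu], so + // each thread later gets its multi_iters communicators as one contiguous slice + // (handed out as threads[t].args.comms).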
+ for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: multiple sizes are not supported yet. + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only a single size is supported for now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + threads[t].args.sendbuffs = sendbuffs + t * nGpus; + threads[t].args.recvbuffs = sendbuffs + t * nGpus; + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ?
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + if (sendbuffs[i]) + CUDACHECK(cudaFree((char *)sendbuffs[i])); + // if (recvbuffs[i]) + // CUDACHECK(cudaFree((char *)recvbuffs[i])); + if (datacheck) + CUDACHECK(cudaFree(expected[i])); + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_inplace/common_inplace.h b/src_inplace/common_inplace.h new file mode 100644 index 0000000..406f634 --- /dev/null +++ b/src_inplace/common_inplace.h @@ -0,0 +1,289 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include <pthread.h> +#include "nccl1_compat.h" + +// #define DEBUG_PRINT 1 + +#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" ..
%s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include <chrono> + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include <unistd.h> + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include <stdint.h> + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c <
n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of: + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + fclose(file); + } + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t<test_typenum; t++) { + if (strcmp(str, test_typenames[t]) == 0) { + return t; + } + } + if (strcmp(str, "all") == 0) { + return -1; + } + printf("invalid type %s, defaulting to %s\n", str, test_typenames[ncclFloat]); + return ncclFloat; +} + +static int ncclstringtoop(char *str) { + for (int o=0; o<test_opnum; o++) { + if (strcmp(str, test_opnames[o]) == 0) { + return o; + } + } + if (strcmp(str, "all") == 0) { + return -1; + } + printf("invalid op %s, defaulting to %s\n", str, test_opnames[ncclSum]); + return ncclSum; +} + +#endif diff --git a/src_inplace/nccl1_compat.h b/src_inplace/nccl1_compat.h new file mode 100644 --- /dev/null +++ b/src_inplace/nccl1_compat.h +#ifndef NCCL1_COMPAT_H +#define NCCL1_COMPAT_H +#ifndef NCCL_MAJOR // NCCL 1.x +#define NCCL_MAJOR 1 +#define NCCL_MINOR 0 + +#define ncclNumOps nccl_NUM_OPS +#define ncclNumTypes nccl_NUM_TYPES + +static ncclResult_t ncclGroupStart() { return ncclSuccess; } +static ncclResult_t ncclGroupEnd() { return ncclSuccess; } + +#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_inplace/ofccl_all_reduce_inp.cu b/src_inplace/ofccl_all_reduce_inp.cu new file mode 100644 index 0000000..9123391 --- /dev/null +++ b/src_inplace/ofccl_all_reduce_inp.cu @@ -0,0 +1,159 @@
+/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_inplace.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + for (int i=0; i<args->nGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // Removed the logging here; otherwise it hurts performance. + // if (collId != collIdFromCqe) { + // // more robust error handling.
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl, + AllReducePrepare +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i<type_count; i++) { + for (int j=0; j<op_count; j++) { + TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], root)); + } + } + return testSuccess; +} diff --git a/src_manual_size/Makefile b/src_manual_size/Makefile new file mode 100644 +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time.
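+# For example (hypothetical values; substitute your own GPU's arch): +# NVCC_GENCODE="-gencode=arch=compute_86,code=sm_86" make +# or rely on the CARDNAME switch below, e.g.: CARDNAME=2080 make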
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG_NT), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info NVCUFLAGS $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := ofccl_all_reduce_ms +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_ms.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_ms.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu new file mode 100644 index 0000000..2b8146c --- /dev/null +++ b/src_manual_size/common_ms.cu @@ -0,0 +1,1578 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_ms.h" +#include "cuda.h" +#include "nccl.h" +#include +#include +#include +#include +#include + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#ifdef FULL_MS + size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; + #ifndef IN_ORDER + int idxList[8][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 
57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + } + }; + #else + int 
idxList[8][MULTI_ITERS] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + } + }; + #endif +#else + // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512}; + // size_t idxList[8][MULTI_ITERS] = { + // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15}, + // {4, 5, 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 4, 5, 6, 9, 10, 11, 7, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 4, 5, 6, 9, 10, 11, 7, 12, 13, 14, 15}, + // {4, 2, 3, 6, 7, 8, 5, 0, 1, 9, 10, 11, 12, 13, 14, 15}, + // {4, 2, 3, 1, 9, 10, 11, 6, 7, 8, 5, 0, 12, 13, 14, 15}, + // {4, 2, 3, 1, 9, 5, 0, 12, 13, 14, 10, 11, 6, 7, 8, 15} + // // {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0} + // }; + + // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 1048576}; + // size_t idxList[8][MULTI_ITERS] = { + // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + // {0, 2, 1, 3, 5, 4, 6, 9, 8, 7}, + // {3, 2, 5, 6, 4, 7, 1, 9, 8, 0}, + // {1, 2, 4, 5, 7, 6, 8, 9, 3, 0}, + // {2, 0, 5, 7, 4, 8, 9, 6, 3, 1}, + // {3, 4, 8, 2, 1, 0, 5, 7, 9, 6}, + // {1, 3, 9, 2, 4, 7, 8, 0, 5, 6}, + // {2, 6, 8, 1, 3, 0, 4, 5, 7, 9} + // }; + size_t countList[MULTI_ITERS] = {256, 147456}; + size_t idxList[8][MULTI_ITERS] = { + {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1} + + {1, 0}, + {1, 0}, + {0, 1}, + {1, 0}, + {0, 1}, + {1, 0}, + {0, 1} + }; 
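+ // Each row of idxList is one rank's issue order over the MULTI_ITERS collectives; + // rows that differ across ranks make the ranks invoke the collectives out of order + // relative to each other (BenchTime consumes this as idxList[cudaDev][miter_idx]).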
+#endif + +size_t sendBytesList[MULTI_ITERS]; +size_t recvBytesList[MULTI_ITERS]; + +#if NCCL_MAJOR >= 2 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; +#else +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static size_t stepBytes = 1 * 1024 * 1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int multi_iters = MULTI_ITERS; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; + break; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch (type) { + case ncclHalf: + return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + return 1e-2; +#endif + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + // case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: + return 1e-200; + } + return 1e-200; +} + +template <typename T> __device__ double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template <typename T> __device__ float toFloat(T a) { return
(float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template <typename T, int BSIZE> +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]); + return testSuccess; +} + +// For integer values, we use values between 0 and 255 (which can be absolutely compared) +template <typename T> +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue<int>(offset, rep, rank)); +} +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + // IF_CHECK: to verify correctness, comment out the first return and expose the second one. + return 1.0 / (1.0 + (float)testValue<int>(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue<float>(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue<float>(offset, rep, rank)); +} +#endif + +// Operations +template <typename T> __device__ T ncclOpSum(T a, T b) { return a + b; } +template <typename T> __device__ T ncclOpProd(T a, T b) { return a * b; } +template <typename T> __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template <typename T> __device__ T ncclOpMin(T a, T b) { return a < b ?
a : b; } + +// Definitions for half +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? a : b; +} + +template <typename T> __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template <typename T> __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); +} +template <typename T> __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); +} +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } + +template <typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)> +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue<T>(o + offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i = 1; i < nranks; i++) { + T val1 = testValue<T>(o + offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel<type, op<type>, preop<type>, postop<type>> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count,
(void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template <typename T> +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) + data[o] = testValue<T>(o, rep, rank); +} + +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel<int8_t>, (void *)InitDataKernel<uint8_t>, + (void *)InitDataKernel<int32_t>, (void *)InitDataKernel<uint32_t>, + (void *)InitDataKernel<int64_t>, (void *)InitDataKernel<uint64_t>, + (void *)InitDataKernel<half>, (void *)InitDataKernel<float>, + (void *)InitDataKernel<double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> +#endif +}; + +template <typename T> +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; + InitDataKernel<T><<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + args->barrier_idx = !args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); + } + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx = !args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; i<args->nGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ?
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; j<args->expectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; j<args->expectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); + while (remaining) { + int idle = 1; + for (int i = 0; i < ngpus; i++) { + if (done[i]) + continue; + + cudaErr = cudaStreamQuery(streams[i]); + if (cudaErr == cudaSuccess) { + done[i] = 1; + remaining--; + idle = 0; + continue; + } + + if (cudaErr != cudaErrorNotReady) + CUDACHECK(cudaErr); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } +#endif + } + + // We might want to let other threads (including NCCL threads) use the CPU. + if (idle) + pthread_yield(); + } + free(done); + return testSuccess; +} + +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ?
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int64_t(scalar); + break; + case ncclUint64: + u64 = uint64_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ?
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + // char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + // char *sendBuff = ((char *)args->sendbuffs[i]) + shift; + char *recvBuff = (char *)(args->recvbuffs[miter]); + char *sendBuff = (char *)(args->sendbuffs[miter]); + + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, RUN sendbuff @ %p, recvbuff @ %p", cudaDev, miter, sendBuff, recvBuff); + + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int64_t(scalar); + break; + case ncclUint64: + u64 = uint64_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + // miter is used as the collId. + TESTCHECK(args->collTest->runColl( + (void *)(sendBuff), + (void *)(recvBuff), miter, cbArgList + miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) + Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs *args, int iter=0) { + if (blocking_coll) + return testSuccess; + + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + pthread_mutex_lock(&cbArgList[i].mutex); + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get %dth cqe for coll_id = %d", pthread_self(), cudaDev, iter, i); + + } + } + pthread_mutex_unlock(&cbArgList[i].mutex); + } + } + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + + int cudaDev; + cudaGetDevice(&cudaDev); + + size_t count = args->nbytes / wordSize(type); + + 
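+ // Added commentary (not part of the original change): each timed iteration below is a submit/poll handshake. startColl() passes cbArgList + miter down through runColl() to ofcclRunAllReduce together with myCallback; the ofccl poller thread later sets gotCqe = 1 under that slot's mutex, and completeColl() spins until every coll_id of the round has been seen exactly once. In outline: + // for (int iter = 1; iter <= iters; iter++) { + // for each miter: seenCqe[miter] = 0; startColl(...); // async submit, one comm per coll_id + // completeColl(args, iter); // returns once all multi_iters CQEs have arrived + // }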
Barrier(args); + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 1; iter <= iters; iter++) { + // Permute the traversal order of miter here, so the collectives are invoked out of order. + for (int miter_idx = 0; miter_idx < multi_iters; miter_idx++) { // for (int miter = 0; miter < multi_iters; miter++) { + int miter = idxList[cudaDev][miter_idx]; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke %dth startColl iter for coll_id = %d", pthread_self(), cudaDev, iter, miter); + seenCqe[miter] = 0; + usleep(200); + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter, rankCtx)); + } + + TESTCHECK(completeColl(args, iter)); + + usleep(100000); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + } + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast<std::chrono::duration<double>>(delta).count(); + deltaSec = deltaSec / (iters * agg_iters * multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, + args->nProcs * args->nThreads * args->nGpus); + + Barrier(args); + + ofcclDestroy(rankCtx); + + double maxDelta = 0; + // static __thread int rep = 0; // was used to re-seed the buffer init; no longer needed. + // rep++; + if (datacheck) { + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec * 1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { + int nranks = args->nProcs * args->nGpus * args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, + recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, + &sendInplaceOffset, &recvInplaceOffset, + (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, + const char *typeName, ncclRedOp_t op, const char *opName, + int root, bool is_ofccl) { + // First create the ofcclRankCtx_t for this rank. + int thrdCudaDev; + CUDACHECK(cudaGetDevice(&thrdCudaDev)); + ofcclRankCtx_t rankCtx; + ofcclInitRankCtx(&rankCtx, thrdCudaDev); + + // Prepare for all sizes; op and type are traversed in the caller. 
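+ // Added commentary (sketch only; every name below is one this file already uses): the ofccl lifecycle TimeTest drives end to end is + // ofcclInitRankCtx(&rankCtx, thrdCudaDev); // once per rank + // prepareColl(...); // -> ofcclPrepareAllReduce(count, type, op, comm, collId, rankCtx), once per coll_id + // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // start the daemon kernel and host threads + // startColl(...); // -> ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx), repeatable + // completeColl(...); // wait for the callbacks' CQE flags + // ofcclDestroy(rankCtx); // called at the end of BenchTime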
+ // TODO: if we support multi size, each size should use a separate ncclComm + + for (int miter = 0; miter < multi_iters; miter++) { + args->nbytes = sendBytesList[miter]; + args->sendBytes = args->nbytes; + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx)); + } + + // Finish preparing the data needed for checking here. + static __thread int rep = 0; + rep++; + if (datacheck) { // Let the data-init kernels run before the daemonKernel is launched. + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev); + } + + // ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a little and saves the kernel-launch time. It must be paired with ofccl not exiting too eagerly on its side. + ofcclFinalizeRankCtx7StartHostThrds(rankCtx); + + // TODO: if we support multi size, we could warm up every size, or keep the current scheme but make sure the correct comm is picked. + // Warmup has to stay enabled, otherwise ofccl performance degrades badly. + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + args->nbytes = sendBytesList[miter]; + args->sendBytes = args->nbytes; + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), thrdCudaDev, iter, multi_iters); + } + + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // Since ofcclDestroy was moved into BenchTime, we cannot call BenchTime twice here (out-of-place first, then in-place). To get that back, add a loop inside BenchTime. + PRINT("\n"); + + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadRunTests"); + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, + test_typenames[nccltype], + (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadInit"); + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs * args->nThreads * args->nGpus; + + // set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i = 0; i < args->nGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread *thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t AllocateBuffLists(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes) { + CUDACHECK(cudaMalloc(sendbuff, sendBytes)); + CUDACHECK(cudaMalloc(recvbuff, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char *argv[]) { + // Make sure every line is flushed so that we see the progress of the test + setlinebuf(stdout); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); + + if (c == -1) + break; + + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if 
(parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'M': + // multi_iters is fixed at compile time (MULTI_ITERS in common_ms.h); the flag is accepted but ignored here. + // multi_iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \ + CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA " + "11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads <num threads>] \n\t" + "[-g,--ngpus <gpus per thread>] \n\t" + "[-b,--minbytes <min size in bytes>] \n\t" + "[-e,--maxbytes <max size in bytes>] \n\t" + "[-i,--stepbytes <increment size>] \n\t" + "[-f,--stepfactor <increment factor>] \n\t" + "[-n,--iters <iteration count>] \n\t" + "[-m,--agg_iters <aggregated iteration count>] \n\t" + "[-M,--multi_iters <multi iteration count>] \n\t" + "[-w,--warmup_iters <warmup iteration count>] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op <sum/prod/min/max/avg/all>] \n\t" +#else + "[-o,--op <sum/prod/min/max/all>] \n\t" +#endif + "[-d,--datatype <nccltype/all>] \n\t" + "[-r,--root <root>] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph <num graph launches>] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), + MPI_BYTE, MPI_COMM_WORLD); + for (int p = 0; p < nProcs; p++) { + if (p == proc) + break; + if (hostHashs[p] == hostHashs[proc]) + localRank++; + } +#endif + is_main_thread = (proc == 0) ? 1 : 0; + + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? "factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + if (multi_iters != MULTI_ITERS) { + OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d does not match MULTI_ITERS", pthread_self(), cudaDev, multi_iters); + } + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? (char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + if (maxBytes > memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) + printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads][MULTI_ITERS]; + void *recvbuffs[nGpus * nThreads][MULTI_ITERS]; + void *expected[nGpus * nThreads]; + // size_t sendBytes, recvBytes; + + // ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + // (size_t)nProcs * nGpus * nThreads); + + ncclTestEngine.getCollByteCountList(sendBytesList, recvBytesList, countList, multi_iters); + // for (int i = 0; i < MULTI_ITERS; i++) { + // OFTEST_LOG(TEST, "sendBytesList[%d] = %lu, recvBytesList[%d] = %lu", i, sendBytesList[i], i, recvBytesList[i]); + // } + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // These allocations are done once per thread. + // TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + // expected + i, (size_t)maxBytes, + // nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + + for (int j = 0; j < multi_iters; j++) { + AllocateBuffLists(&sendbuffs[i][j], sendBytesList[j], &recvbuffs[i][j], recvBytesList[j]); + + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, ALLOCATE sendbuff @ %p, recvbuff @ %p", i, j, sendbuffs[i][j], recvbuffs[i][j]); + } + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when using multiple sizes. 
+ ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use a separate comm per coll_id (miter) + // TODO: we do not support MPI now. + // comms is laid out as [miter][thread][gpu]; adjusted_comms regroups it as + // [thread][miter][gpu], so each thread below gets one contiguous slice. + for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: multiple sizes are not supported. + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + // threads[t].args.sendbuffs = sendbuffs[t]; + // threads[t].args.recvbuffs = recvbuffs[t]; + for (int j = 0; j < MULTI_ITERS; j++) { + threads[t].args.sendbuffs[j] = sendbuffs[t][j]; + threads[t].args.recvbuffs[j] = recvbuffs[t][j]; + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, DISPATCH SRC sendbuff @ %p, recvbuff @ %p", t, j, sendbuffs[t][j], recvbuffs[t][j]); + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, DISPATCH IN ARGS sendbuff @ %p, recvbuff @ %p", t, j, threads[t].args.sendbuffs[j], threads[t].args.recvbuffs[j]); + } + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, 
nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + for (int j = 0; j < MULTI_ITERS; j++) { + CUDACHECK(cudaFree((char *)sendbuffs[i][j])); + CUDACHECK(cudaFree((char *)recvbuffs[i][j])); + } + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h new file mode 100644 index 0000000..14f0ffb --- /dev/null +++ b/src_manual_size/common_ms.h @@ -0,0 +1,303 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#include <unistd.h> // usleep +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include <pthread.h> +#include "nccl1_compat.h" + +// #define DEBUG_PRINT 1 + +#define FULL_MS 1 + +#ifdef FULL_MS + #define MULTI_ITERS 161 +#else + // #define MULTI_ITERS 16 + #define MULTI_ITERS 2 +#endif + +// #define IN_ORDER 1 + +#define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + int cudaDev; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); + void (*getCollByteCountList)(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void* sendbuffs[MULTI_ITERS]; + size_t sendBytes; + size_t sendInplaceOffset; + void* recvbuffs[MULTI_ITERS]; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef 
testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include <chrono> + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include <unistd.h> + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include <stdint.h> + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // The fallback is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + fclose(file); + } + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu new file mode 100644 index 0000000..ccde169 --- /dev/null +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -0,0 +1,175 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_ms.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST, "Hi <%lu> Rank<%d>, sendcount = %p, recvcount = %p, paramcount = %p, sendInplaceOffset = %p, recvInplaceOffset = %p, count = %lu, nranks = %d", pthread_self(), cudaDev, sendcount, recvcount, paramcount, sendInplaceOffset, recvInplaceOffset, count, nranks); + + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +void AllReduceGetCollByteCountList(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen) { // listLen equals multi_iters + // OFTEST_LOG1(TEST, "hi"); + for (int i = 0; i < listLen; i++) { + *(sendCntList + i) = *(countList + i); + *(recvCntList + i) = *(countList + i); + } +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + for (int i=0; i<args->nGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // Logging removed here; keeping it would hurt performance. + // if (collId != collIdFromCqe) { + // // more robust error handle. 
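+ // (Added note, not part of the original change: myCallback runs on the ofccl poller thread, so it only takes the slot's mutex and sets gotCqe. A more defensive version could first compare collIdFromCqe against ((CallBackArgs *)args)->collId and report a mismatch, as the commented-out lines just below sketch.)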
// OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); // This is invoked later on the poller thread, so the device it reports would not be the right one. + + // int collId = ((CallBackArgs *)args)->collId; + // int cudaDev = ((CallBackArgs *)args)->cudaDev; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + args->collId = collId; + args->gotCqe = 0; + args->cudaDev = cudaDev; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl, + AllReducePrepare +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. 
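+# (Added usage note, not part of the original change.) The knobs this Makefile reads are CARDNAME, MPI/MPI_HOME, NCCL_HOME, DEBUG_NT and VERBOSE, all handled below. For example: +# make CARDNAME=3080 # default: -gencode=arch=compute_86,code=sm_86 +# make CARDNAME=2080 MPI=1 MPI_HOME=/path/to/mpi # sm_75 build with MPI support +# make DEBUG_NT=1 # -O0 -G -g, suitable for cuda-gdb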
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG_NT), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := all_reduce_nccl_ms +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_nccl_ms.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_nccl_ms.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_nccl_manual_size/all_reduce_nccl_ms.cu b/src_nccl_manual_size/all_reduce_nccl_ms.cu new file mode 100644 index 0000000..7bab5c2 --- /dev/null +++ b/src_nccl_manual_size/all_reduce_nccl_ms.cu @@ -0,0 +1,123 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_nccl_ms.h" + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +void AllReduceGetCollByteCountList(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen) { // listLen equals agg_iters + // OFTEST_LOG1(TEST, "hi"); + for (int i = 0; i < listLen; i++) { + *(sendCntList + i) = *(countList + i); + *(recvCntList + i) = *(countList + i); + } +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; i<args->nGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +// TODO: hacky global variables for now +// size_t countList[AGG_ITERS] = {4000, 8192000}; +size_t countList[AGG_ITERS] = {4000, 8192000}; +size_t sendBytesList[AGG_ITERS]; +size_t recvBytesList[AGG_ITERS]; +// ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; +ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; +int idxList[8][AGG_ITERS] = { + {0, 1}, + {1, 0} +}; + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = AGG_ITERS; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { + case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template<typename T> __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template<typename T> __device__ +float toFloat(T a) { + return (float)a; +} +template<> __device__ +float toFloat(half a) { + return __half2float(a); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> __device__ +float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template<typename T, int BSIZE> __global__ +void deltaKern(void* A_, void* B_, size_t count, double* max) { + const T* A = (const T*)A_; + const T* B = (const T*)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x*blockDim.x + threadIdx.x; + double locmax = 0.0; + for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) { + double delta = absDiff(A[i], B[i]); + if( delta > locmax ) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for(int stride = BSIZE/2; stride > 1; stride>>=1) { + __syncthreads(); + if( tid < stride ) + temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + } + __syncthreads(); + if( threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? 
temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]); + return testSuccess; +} + +template<typename T> +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep+rank+offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template<> +__device__ double testValue<double>(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(double)testValue<int>(offset, rep, rank)); +} +template<> +__device__ float testValue<float>(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(float)testValue<int>(offset, rep, rank)); +} +template<> +__device__ half testValue<half>(const size_t offset, const int rep, const int rank) { + return __float2half(testValue<float>(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { + return __float2bfloat16(testValue<float>(offset, rep, rank)); +} +#endif + +// Operations +template<typename T> +__device__ T ncclOpSum(T a, T b) { return a+b; } +template<typename T> +__device__ T ncclOpProd(T a, T b) { return a*b; } +template<typename T> +__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } +template<typename T> +__device__ T ncclOpMin(T a, T b) { return a<b ? a : b; } +template<> +__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } +template<> +__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } +template<> +__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } +template<> +__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } + +template<typename T> +__device__ T ncclPPOpIdent(T x, int arg) { return x; } +template<typename T> +__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } +template<typename T> +__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } +template<> +__device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x)*float(arg)); +} +template<> +__device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x)/n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x)*float(arg)); +} +template<> +__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x)/n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { + return 1 + rank%2; +} + +template<typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)> +__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=blockDim.x*gridDim.x) { + T val = testValue<T>(o+offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i=1; i<nranks; i++) { + T val1 = testValue<T>(o+offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel<type, op<type>, preop<type>, postop<type>> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template<typename T> +__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=blockDim.x*gridDim.x) + data[o] = testValue<T>(o, rep, rank); +} + +static void* const initDataKerns[ncclNumTypes] = { + (void*)InitDataKernel< int8_t>, + (void*)InitDataKernel< uint8_t>, + (void*)InitDataKernel< int32_t>, + (void*)InitDataKernel<uint32_t>, + (void*)InitDataKernel< int64_t>, + (void*)InitDataKernel<uint64_t>, + (void*)InitDataKernel< half>, + 
(void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char *recvBuff = (char *)(args->recvbuffs[iter]); + char *sendBuff = (char *)(args->sendbuffs[iter]); + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(sendBuff), + (void*)(recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + int cudaDev; + cudaGetDevice(&cudaDev); + + Barrier(args); + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); + // for (int aiter = 0; aiter < agg_iters; aiter++) { + for (int aiter_idx = 0; aiter_idx < agg_iters; aiter_idx++) { + int aiter = idxList[cudaDev][aiter_idx]; + args->nbytes = sendBytesList[aiter]; + args->sendBytes = args->nbytes; + TESTCHECK(startColl(args, typeList[aiter], op, root, in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + 
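// getBw above maps the averaged per-operation time onto the two reported
// bandwidths; for AllReduce, nccl-tests' documented model works out to:
//   algBw = (count * typesize) / 1e9 / deltaSec              // GB/s
//   busBw = algBw * 2 * (nranks - 1) / nranks                // ring hop factor
// e.g. 8 ranks at algBw = 10 GB/s report busBw = 17.5 GB/s.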
+ double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + + // Benchmark + args->nbytes = sendBytesList[0]; + args->sendBytes = args->nbytes; + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TODO: verify experimentally whether the in-place run should be restored. + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ?
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 
'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t AllocateBuffLists(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes) { + CUDACHECK(cudaMalloc(sendbuff, sendBytes)); + CUDACHECK(cudaMalloc(recvbuff, recvBytes)); + return testSuccess; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef 
MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads][AGG_ITERS]; + void* recvbuffs[nGpus*nThreads][AGG_ITERS]; + void* expected[nGpus*nThreads]; + // size_t sendBytes, recvBytes; + + // ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + ncclTestEngine.getCollByteCountList(sendBytesList, recvBytesList, countList, agg_iters); + + for (int i=0; i coll_id = %d, ALLOCATE sendbuff @ %p, recvbuff @ %p", i, j, sendbuffs[i][j], recvbuffs[i][j]); + } + } + + //if parallel init is not selected, use main thread to initialize NCCL + ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus*nThreads]; + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + // threads[t].args.sendbuffs = sendbuffs+t*nGpus; + // threads[t].args.recvbuffs = recvbuffs+t*nGpus; + for (int j = 0; j < AGG_ITERS; j++) { + threads[t].args.sendbuffs[j] = sendbuffs[t][j]; + threads[t].args.recvbuffs[j] = recvbuffs[t][j]; + } + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +#define AGG_ITERS 2 + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); + void (*getCollByteCountList)(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void* sendbuffs[AGG_ITERS]; + size_t sendBytes; + size_t sendInplaceOffset; + void* recvbuffs[AGG_ITERS]; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + 
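// The barrier[]/reduce[] pairs above are double-buffered: barrier_idx flips
// after every use, so two back-to-back Barrier()/Allreduce() calls never race
// on the same slot. syncRank and syncNranks (next) identify this rank within
// the sync[] group.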
int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t +#ifndef NCCL1_COMPAT_H +#define NCCL1_COMPAT_H + +#ifndef NCCL_MAJOR // NCCL 1.x +#define NCCL_MAJOR 1 +#define NCCL_MINOR 0 + +#define ncclNumOps nccl_NUM_OPS +#define ncclNumTypes nccl_NUM_TYPES + +static ncclResult_t ncclGroupStart() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } +static ncclResult_t ncclGroupEnd() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } + +#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_simple/Makefile b/src_simple/Makefile new file mode 100644 index 0000000..2206f40 --- /dev/null +++ b/src_simple/Makefile @@ -0,0 +1,109 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG_NT ?= 0 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. 
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG_NT), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter ofccl_reduce ofccl_broadcast +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_simple.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_simple.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu new file mode 100644 index 0000000..fc1d809 --- /dev/null +++ b/src_simple/common_simple.cu @@ -0,0 +1,1534 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_simple.h" +#include "cuda.h" +#include "nccl.h" +#include +#include +#include +#include +#include + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; +#else +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static size_t stepBytes = 1 * 1024 * 1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int multi_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + +// bool StringToInteger(const std::string& str, int64_t* value) { +// char* end; +// int64_t v = std::strtoll(str.data(), &end, 10); +// if (end == str.data()) { +// return false; +// } else { +// *value = v; +// return true; +// } +// } + +// static int64_t ParseIntegerFromEnv(const std::string& env_var, int64_t default_value) { +// const char* env_p = std::getenv(env_var.c_str()); +// if (env_p == nullptr) { return default_value; } +// int64_t value; +// if (StringToInteger(env_p, &value)) { +// return value; +// } else { +// return default_value; +// } +// } + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; + break; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + 
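// count == 1: no suffix was parsed, so a bare number is taken as bytes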
break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch (type) { + case ncclHalf: + return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + return 1e-2; +#endif + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + // case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: + return 1e-200; + } + return 1e-200; +} + +template <typename T> __device__ double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template <typename T> __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template <typename T, int BSIZE> +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<int8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<int32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<int64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]); + return testSuccess; +} + +template <typename T> +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs.
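// Since testValue is a pure function of (offset, rep, rank), the host can
// recompute any rank's input without communication -- a minimal sketch of the
// integer case (hostTestValue is our illustrative name, not part of the test):
static inline uint8_t hostTestValue(size_t offset, int rep, int rank) {
  return (uint8_t)((rep + rank + offset) % 256); // same formula as the kernel
}
// The expected reduction result at element o is then Op applied over
// hostTestValue(o, rep, r) for all ranks r, which is exactly what
// InitDataReduceKernel below evaluates on the GPU to fill the expected buffer.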
+template <> +__device__ double testValue<double>(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue<int>(offset, rep, rank)); +} +template <> +__device__ float testValue<float>(const size_t offset, const int rep, + const int rank) { + // IF_CHECK: to check correctness, comment out the first return and use the second one instead. + return 1.0 / (1.0 + (float)testValue<int>(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue<half>(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue<float>(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue<float>(offset, rep, rank)); +} +#endif + +// Operations +template <typename T> __device__ T ncclOpSum(T a, T b) { return a + b; } +template <typename T> __device__ T ncclOpProd(T a, T b) { return a * b; } +template <typename T> __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template <typename T> __device__ T ncclOpMin(T a, T b) { return a < b ? a : b; } + +// Definitions for half +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? a : b; +} + +template <typename T> __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template <typename T> __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); +} +template <typename T> __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); +} +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } + +template <typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)> +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue<T>(o + offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i = 1; i < nranks; i++) { + T val1 = testValue<T>(o + offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel<type, op<type>, preop<type>, postop<type>> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent,
ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count, (void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) + data[o] = testValue(o, rep, rank); +} + +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + args->barrier_idx = !args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); + } + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + 
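// average: 1 -> MPI_SUM (normalized by nProcs*nThreads below),
// 2 -> MPI_MIN, 3 -> MPI_MAX; average == 0 skips the MPI reduction
// and reports rank 0's own value.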
MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx = !args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); + while (remaining) { + int idle = 1; + for (int i = 0; i < ngpus; i++) { + if (done[i]) + continue; + + cudaErr = cudaStreamQuery(streams[i]); + if (cudaErr == cudaSuccess) { + done[i] = 1; + remaining--; + idle = 0; + continue; + } + + if (cudaErr != cudaErrorNotReady) + CUDACHECK(cudaErr); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } +#endif + } + + // We might want to let other threads (including NCCL threads) use the CPU. 
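// (A full pass over the streams with no completions marks the loop "idle";
//  yielding then keeps this busy-poll from starving NCCL's own proxy threads.)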
+ if (idle) + pthread_yield(); + } + free(done); + return testSuccess; +} + +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, root, comm, miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + char *sendBuff = ((char *)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + // miter doubles as the collective ID (collId). + TESTCHECK(args->collTest->runColl( + (void *)(in_place ?
recvBuff + args->recvInplaceOffset * rank + : recvBuff), miter, cbArgList + miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) + Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs *args) { + if (blocking_coll) + return testSuccess; + + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + pthread_mutex_lock(&cbArgList[i].mutex); + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // if (cudaDev == 0) { + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); + // } + + } + } + pthread_mutex_unlock(&cbArgList[i].mutex); + } + } + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + + size_t count = args->nbytes / wordSize(type); + + // Sync: following nccl-tests, run one untimed round here as well. + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + 0 * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + + Barrier(args); + + // int64_t NEW_TIMER = ParseIntegerFromEnv("NEW_TIMER", 0); + // int64_t SHOW_ITER_TIME = ParseIntegerFromEnv("SHOW_ITER_TIME", 0); + + // Performance Benchmark + #ifdef NEW_TIMER + double deltaSec = 0.0; + #else + auto start = std::chrono::high_resolution_clock::now(); + #endif + + for (int iter = 0; iter < iters; iter++) { + + #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME) + auto iter_start = std::chrono::high_resolution_clock::now(); + #endif + + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter, rankCtx)); + } + + TESTCHECK(completeColl(args)); + + #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME) + auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start; + double iter_deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count(); + + int cudaDev; + cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + if (cudaDev == 0) + OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6); + #endif + + #ifdef NEW_TIMER + deltaSec += iter_deltaSec; + #endif + } + + #ifndef NEW_TIMER + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast<std::chrono::duration<double>>(delta).count(); + #endif + + deltaSec = deltaSec / (iters * multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; + + #ifdef SHOW_AVG_TIME + int cudaDev; + cudaGetDevice(&cudaDev); + if (cudaDev == 0) + OFTEST_LOG(TEST, "Rank<%d>, time = %lf us, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + + // int clockRate; + // cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, cudaDev); + // int memoryClockRate; + // cudaDeviceGetAttribute(&memoryClockRate,
+
+testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type,
+                       ncclRedOp_t op, int root, int in_place,
+                       ofcclRankCtx_t rankCtx) {
+
+  size_t count = args->nbytes / wordSize(type);
+
+  // Sync: nccl-tests does this too, so run one untimed round first.
+  for (int miter = 0; miter < multi_iters; miter++) {
+    seenCqe[miter] = 0;
+    TESTCHECK(startColl(args, type, op, root, in_place,
+                        0 * multi_iters + miter, miter, rankCtx));
+  }
+  TESTCHECK(completeColl(args));
+
+  Barrier(args);
+
+  // int64_t NEW_TIMER = ParseIntegerFromEnv("NEW_TIMER", 0);
+  // int64_t SHOW_ITER_TIME = ParseIntegerFromEnv("SHOW_ITER_TIME", 0);
+
+  // Performance Benchmark
+  #ifdef NEW_TIMER
+  double deltaSec = 0.0;
+  #else
+  auto start = std::chrono::high_resolution_clock::now();
+  #endif
+
+  for (int iter = 0; iter < iters; iter++) {
+
+    #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME)
+    auto iter_start = std::chrono::high_resolution_clock::now();
+    #endif
+
+    for (int miter = 0; miter < multi_iters; miter++) {
+      seenCqe[miter] = 0;
+      TESTCHECK(startColl(args, type, op, root, in_place,
+                          iter * multi_iters + miter, miter, rankCtx));
+    }
+
+    TESTCHECK(completeColl(args));
+
+    #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME)
+    auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start;
+    double iter_deltaSec =
+        std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count();
+
+    int cudaDev;
+    cudaGetDevice(&cudaDev);
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
+    if (cudaDev == 0)
+      OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6);
+    #endif
+
+    #ifdef NEW_TIMER
+    deltaSec += iter_deltaSec;
+    #endif
+  }
+
+  #ifndef NEW_TIMER
+  auto delta = std::chrono::high_resolution_clock::now() - start;
+  double deltaSec =
+      std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+  #endif
+
+  deltaSec = deltaSec / (iters * multi_iters);
+  if (cudaGraphLaunches >= 1)
+    deltaSec = deltaSec / cudaGraphLaunches;
+
+  #ifdef SHOW_AVG_TIME
+  int cudaDev;
+  cudaGetDevice(&cudaDev);
+  if (cudaDev == 0)
+    OFTEST_LOG(TEST, "Rank<%d>, time = %lf us, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters);
+
+  // int clockRate;
+  // cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, cudaDev);
+  // int memoryClockRate;
+  // cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, cudaDev);
+  // OFTEST_LOG(TEST, "Rank<%d>, clockRate = %d, memoryClockRate = %d", cudaDev, clockRate, memoryClockRate);
+
+  // cudaDeviceProp prop;
+  // cudaGetDeviceProperties(&prop, cudaDev);
+  // OFTEST_LOG(TEST, "Rank<%d>, prop.clockRate = %d, prop.memoryClockRate = %d", cudaDev, prop.clockRate, prop.memoryClockRate);
+  #endif
+
+  Allreduce(args, &deltaSec, average);
+
+  double algBw, busBw;
+  args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw,
+                        args->nProcs * args->nThreads * args->nGpus);
+
+  Barrier(args);
+
+  ofcclDestroy(rankCtx);
+
+  double maxDelta = 0;
+  // static __thread int rep = 0; // parameter for re-initializing the buffers; no longer used.
+  // rep++;
+  if (datacheck) {
+
+    TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
+    // aggregate delta from all threads and procs
+    Allreduce(args, &maxDelta, 3);
+  }
+
+  double timeUsec = deltaSec * 1.0E6;
+  char timeStr[100];
+  if (timeUsec >= 10000.0) {
+    sprintf(timeStr, "%7.0f", timeUsec);
+  } else if (timeUsec >= 100.0) {
+    sprintf(timeStr, "%7.1f", timeUsec);
+  } else {
+    sprintf(timeStr, "%7.2f", timeUsec);
+  }
+  if (datacheck) {
+    PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
+  } else {
+    PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
+  }
+
+  args->bw[0] += busBw;
+  args->bw_count[0]++;
+  return testSuccess;
+}
+
+void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) {
+  int nranks = args->nProcs * args->nGpus * args->nThreads;
+  size_t count, sendCount, recvCount, paramCount, sendInplaceOffset,
+      recvInplaceOffset;
+
+  count = size / wordSize(type);
+  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount,
+                                   &sendInplaceOffset, &recvInplaceOffset,
+                                   (size_t)count, (size_t)nranks);
+
+  args->nbytes = paramCount * wordSize(type);
+  args->sendBytes = sendCount * wordSize(type);
+  args->expectedBytes = recvCount * wordSize(type);
+  args->sendInplaceOffset = sendInplaceOffset * wordSize(type);
+  args->recvInplaceOffset = recvInplaceOffset * wordSize(type);
+}
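setupArgs maps one logical transfer size onto per-rank send/recv/param byte counts through the collective's getCollByteCount hook. A worked example using the AllGather rules defined later in this diff (send side gets count/nranks elements, receive side the full count), written as a hypothetical standalone C++ program:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t nranks = 4, size = 1 << 20, wordsz = 4; // 1 MiB of float32
  size_t count = size / wordsz;                        // 262144 elements total
  size_t sendcount = count / nranks;                   // 65536 per rank
  size_t recvcount = (count / nranks) * nranks;        // 262144 gathered
  size_t paramcount = sendcount;                       // what runColl is given
  printf("sendBytes=%zu recvBytes=%zu paramBytes=%zu\n",
         sendcount * wordsz, recvcount * wordsz, paramcount * wordsz);
  return 0; // prints sendBytes=262144 recvBytes=1048576 paramBytes=262144
}

args->nbytes (the param count in bytes) is what BenchTime divides by wordSize(type) to recover the element count it passes to getBw.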
+
+testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type,
+                      const char *typeName, ncclRedOp_t op, const char *opName,
+                      int root, bool is_ofccl) {
+  // First create the ofcclRankCtx_t.
+  int thrdCudaDev;
+  CUDACHECK(cudaGetDevice(&thrdCudaDev));
+  ofcclRankCtx_t rankCtx;
+  ofcclInitRankCtx(&rankCtx, thrdCudaDev);
+
+  // Prepare for all sizes; op and type are traversed by the caller.
+  // TODO: if we support multi size, each size should use a separate ncclComm
+  for (size_t size = args->minbytes; size <= args->maxbytes;
+       size = ((args->stepfactor > 1) ? size * args->stepfactor
+                                      : size + args->stepbytes)) {
+    setupArgs(size, type, args);
+    for (int miter = 0; miter < multi_iters; miter++) {
+      TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx));
+    }
+  }
+
+  // Prepare the data needed for checking here.
+  static __thread int rep = 0;
+  rep++;
+  if (datacheck) { // Run the data-init kernels before the daemon kernel starts.
+    // Initialize sendbuffs, recvbuffs and expected
+    TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0));
+
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev);
+  }
+
+  ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a little and saves the kernel-launch time. Pair it with ofccl not exiting eagerly on its own.
+  // ofcclFinalizeRankCtx7StartHostThrds(rankCtx);
+
+  // TODO: if we support multi size, we could warm up every size, or keep the current approach but make sure the correct comm is selected.
+  // Warmup must stay enabled, otherwise ofccl performance collapses.
+  setupArgs(args->maxbytes, type, args);
+  for (int iter = 0; iter < warmup_iters; iter++) {
+    for (int miter = 0; miter < multi_iters; miter++) {
+      seenCqe[miter] = 0;
+      TESTCHECK(startColl(args, type, op, root, 0,
+                          iter * multi_iters + miter, miter, rankCtx));
+    }
+    TESTCHECK(completeColl(args));
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), thrdCudaDev, iter, multi_iters);
+  }
+
+  print_line_header(max(args->sendBytes, args->expectedBytes),
+                    args->nbytes / wordSize(type), typeName, opName, root);
+  TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx));
+  // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // ofcclDestroy was moved into BenchTime, so BenchTime cannot be called twice here (out-of-place first, then in-place); to get that back, add a loop inside BenchTime.
+  PRINT("\n");
+
+  return testSuccess;
+}
+
+testResult_t threadRunTests(struct threadArgs *args) {
+  // OFTEST_LOG1(TEST, "Enter threadRunTests");
+  // Set device to the first of our GPUs. If we don't do that, some operations
+  // will be done on the current GPU (by default : 0) and if the GPUs are in
+  // exclusive mode those operations will fail.
+  int gpuid = args->localRank * args->nThreads * args->nGpus +
+              args->thread * args->nGpus;
+  CUDACHECK(cudaSetDevice(gpuid));
+  TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype,
+                                   test_typenames[nccltype],
+                                   (ncclRedOp_t)ncclop, test_opnames[ncclop]));
+  return testSuccess;
+}
+
+testResult_t threadInit(struct threadArgs *args) {
+  // OFTEST_LOG1(TEST, "Enter threadInit");
+  char hostname[1024];
+  getHostName(hostname, 1024);
+  int nranks = args->nProcs * args->nThreads * args->nGpus;
+
+  // set main thread again
+  is_main_thread = (args->proc == 0 && args->thread == 0) ?
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i = 0; i < args->nGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread *thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char *argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); + + if (c == -1) + break; + + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid 
size specified for 'maxbytes'\n");
+        return -1;
+      }
+      maxBytes = (size_t)parsed;
+      break;
+    case 'i':
+      stepBytes = strtol(optarg, NULL, 0);
+      break;
+    case 'f':
+      stepFactor = strtol(optarg, NULL, 0);
+      break;
+    case 'n':
+      iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'M':
+      multi_iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'm':
+#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
+      agg_iters = (int)strtol(optarg, NULL, 0);
+#else
+      fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n");
+#endif
+      break;
+    case 'w':
+      warmup_iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'c':
+      datacheck = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'p':
+      parallel_init = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'o':
+      ncclop = ncclstringtoop(optarg);
+      break;
+    case 'd':
+      nccltype = ncclstringtotype(optarg);
+      break;
+    case 'r':
+      ncclroot = strtol(optarg, NULL, 0);
+      break;
+    case 'z':
+      blocking_coll = strtol(optarg, NULL, 0);
+      break;
+    case 'G':
+#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \
+    CUDART_VERSION >= 11030
+      cudaGraphLaunches = strtol(optarg, NULL, 0);
+#else
+      printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA "
+             "11.3. Ignoring\n");
+#endif
+      break;
+    case 'a':
+      average = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'h':
+    default:
+      if (c != 'h')
+        printf("invalid option '%c'\n", c);
+      printf("USAGE: %s \n\t"
+             "[-t,--nthreads <num threads>] \n\t"
+             "[-g,--ngpus <gpus per thread>] \n\t"
+             "[-b,--minbytes <min size in bytes>] \n\t"
+             "[-e,--maxbytes <max size in bytes>] \n\t"
+             "[-i,--stepbytes <increment size>] \n\t"
+             "[-f,--stepfactor <increment factor>] \n\t"
+             "[-n,--iters <iteration count>] \n\t"
+             "[-m,--agg_iters <aggregated iteration count>] \n\t"
+             "[-M,--multi_iters <multi iteration count>] \n\t"
+             "[-w,--warmup_iters <warmup iteration count>] \n\t"
+             "[-p,--parallel_init <0/1>] \n\t"
+             "[-c,--check <0/1>] \n\t"
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+             "[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t"
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+             "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
+#else
+             "[-o,--op <sum/prod/min/max/all>] \n\t"
+#endif
+             "[-d,--datatype <nccltype/all>] \n\t"
+             "[-r,--root <root>] \n\t"
+             "[-z,--blocking <0/1>] \n\t"
+             "[-G,--cudagraph <num graph launches>] \n\t"
+             "[-a,--average <0/1/2/3> report average iteration time "
+             "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
+             "[-h,--help]\n",
+             basename(argv[0]));
+      return 0;
+    }
+  }
+  if (minBytes > maxBytes) {
+    fprintf(stderr,
+            "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n",
+            (unsigned long long)minBytes, (unsigned long long)maxBytes);
+    return -1;
+  }
+#ifdef MPI_SUPPORT
+  MPI_Init(&argc, &argv);
+#endif
+  TESTCHECK(run());
+  return 0;
+}
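main() hands off to run(), which sets up buffers, comms, and threads; the ofccl call sequence those threads eventually drive is easier to see in isolation. The sketch below condenses it to one rank and one collective, using only entry points that appear elsewhere in this diff (ofcclInitRankCtx, ofcclPrepareAllReduce, ofcclPrepareDone, ofcclRunAllReduce, ofcclDestroy). ofcclMinimalFlow is a hypothetical helper, the device buffers and comm are assumed to exist, and the polling loop stands in for completeColl:

// Minimal single-rank, single-collective flow (sketch; CallBackArgs and
// myCallback are the handshake pair defined in this diff).
testResult_t ofcclMinimalFlow(ncclComm_t comm, void *d_send, void *d_recv,
                              size_t count) {
  int dev;
  CUDACHECK(cudaGetDevice(&dev));

  ofcclRankCtx_t rankCtx;
  ofcclInitRankCtx(&rankCtx, dev);                  // per-rank context

  const int collId = 0;                             // ids are dense: 0..N-1
  NCCLCHECK(ofcclPrepareAllReduce(count, ncclFloat, ncclSum, comm,
                                  collId, rankCtx)); // register the collective
  ofcclPrepareDone(rankCtx);                        // per the comments above,
                                                    // this starts the daemon kernel

  CallBackArgs cb;                                  // CQE handshake state
  cb.collId = collId;
  cb.gotCqe = 0;
  pthread_mutex_init(&cb.mutex, NULL);
  NCCLCHECK(ofcclRunAllReduce(d_send, d_recv, collId, myCallback, &cb, rankCtx));

  for (;;) {                                        // poll until the CQE lands
    pthread_mutex_lock(&cb.mutex);
    int done = cb.gotCqe;
    pthread_mutex_unlock(&cb.mutex);
    if (done) break;
  }

  ofcclDestroy(rankCtx);                            // tear down the daemon
  return testSuccess;
}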
"factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? (char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + if (maxBytes > memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) + printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads]; + void *recvbuffs[nGpus * nThreads]; + void *expected[nGpus * nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + (size_t)nProcs * nGpus * nThreads); + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + expected + i, (size_t)maxBytes, + nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when use multi size. + ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use seprate comm + // TODO: we do not support MPI now. 
+ for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: 不支持多个size。 + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + threads[t].args.sendbuffs = sendbuffs + t * nGpus; + threads[t].args.recvbuffs = recvbuffs + t * nGpus; + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + if (sendbuffs[i]) + CUDACHECK(cudaFree((char *)sendbuffs[i])); + if (recvbuffs[i]) + CUDACHECK(cudaFree((char *)recvbuffs[i])); + if (datacheck) + CUDACHECK(cudaFree(expected[i])); + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h new file mode 100644 index 0000000..daba610 --- /dev/null +++ b/src_simple/common_simple.h @@ -0,0 +1,295 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +// 环境变量是方便,但是会多一些判断,可能影响性能。 +// #define DEBUG_PRINT 1 + +// #define NEW_TIMER 1 +// #define SHOW_ITER_TIME 1 +#define SHOW_AVG_TIME 1 + +#define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + // int cqeCnt; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct 
threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_simple/ofccl_all_gather.cu b/src_simple/ofccl_all_gather.cu new file mode 100644 index 0000000..6cf8ddf --- /dev/null +++ b/src_simple/ofccl_all_gather.cu @@ -0,0 +1,151 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", ""); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s", size, count, typeName); +} + +void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count/nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = count/nranks; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunAllGather(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllGather for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllGather sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllGather(count, datatype, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllGather with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allGatherTest = { + "AllGather", + AllGatherGetCollByteCount, + AllGatherInitData, + AllGatherGetBw, + AllGatherRunColl, + AllGatherPrepare +}; + +void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllGatherGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allGatherTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s 
out-of-place in-place \n", "", "", "", "");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
+}
+
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
+}
+
+testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+
+  // int cudaDev;
+  // CUDACHECK(cudaGetDevice(&cudaDev));
+  // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev);
+  return testSuccess;
+}
+
+void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  // Ring allreduce moves 2*(n-1)/n bytes per rank per payload byte, so busbw
+  // scales algbw by that factor; e.g. nranks=4 gives 1.5x.
+  double factor = ((double)(2*(nranks - 1)))/((double)nranks);
+  *busBw = baseBw * factor;
+}
+
+int myCallback(int collIdFromCqe, void *args) {
+  // Deleted the logging here; otherwise it hurts performance.
+  // if (collId != collIdFromCqe) {
+  //   // more robust error handling.
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + + // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl, + AllReducePrepare +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6i", size, count, typeName, root); +} + +void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t 
*sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunBroadcast(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunBroadcast for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunBroadcast sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t BroadcastPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareBroadcast(count, datatype, root, comm, collId, rankCtx)); + OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareBroadcast with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl broadcastTest = { + "Broadcast", + BroadcastGetCollByteCount, + BroadcastInitData, + BroadcastGetBw, + BroadcastRunColl, + BroadcastPrepare +}; + +void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + BroadcastGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t BroadcastRunTest(struct threadArgs* args, int root, 
ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &broadcastTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root); +} + +void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + *algBw = baseBw; + *busBw = baseBw; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t ReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareReduce(count, datatype, op, root, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl reduceTest = { + "Reduce", + ReduceGetCollByteCount, + ReduceInitData, + ReduceGetBw, + ReduceRunColl, + ReducePrepare +}; + +void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void 
ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; + *recvcount = count/nranks; + *sendInplaceOffset = 0; + *recvInplaceOffset = count/nranks; + *paramcount = *recvcount; +} + +testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunReduceScatter(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduceScatter for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduceScatter sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t ReduceScatterPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareReduceScatter(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareReduceScatter with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl 
reduceScatterTest = { + "ReduceScatter", + ReduceScatterGetCollByteCount, + ReduceScatterInitData, + ReduceScatterGetBw, + ReduceScatterRunColl, + ReduceScatterPrepare +}; + +void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceScatterGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceScatterTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i>"+op['nccl_bw_path']) + os.system("echo $(date +%F%n%T)>>"+op['nccl_time_path']) + + + for iter in NCCL_TIER: + # raw data + AR['nccl_rawData'] = NCCL_RES_DIR+"/nccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + AG['nccl_rawData'] = NCCL_RES_DIR+"/nccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + B['nccl_rawData'] = NCCL_RES_DIR+"/nccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + R['nccl_rawData'] = NCCL_RES_DIR+"/nccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + RS['nccl_rawData'] = NCCL_RES_DIR+"/nccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + + if runNcclTest: + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>> "+op['nccl_rawData']) + + for a in buffer_sizes: + os.system(op['run']+" -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ op['nccl_rawData']) + + if staticNccl: + for op in [AR,AG,B,R,RS]: + os.system("./nccl/static_nccl.out " +op['nccl_rawData'] +" " +op['nccl_bw_path']) + os.system("./nccl/static_time.out " +op['nccl_rawData'] +" " +op['nccl_time_path']) + + + + if collectNcclResult : + for op in [AR,AG,B,R,RS]: + # bus + op['bwSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + + with open(op['nccl_bw_path']) as f: + content = f.read() + bw = content.split() + + axis_y = buffer_sizes + for a in range(0,25): + op['bwSheet'].write(2+a+cnt*30,0,axis_y[a],style) + # + for k in [0,1,2]: + op['bwSheet'].write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,1+k,bw[i+k*50+2],style) + + op['bwSheet'].write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) + # avg + op['bwSheet'].write(1+cnt*30, 4, 'avg-algbw',style) + op['bwSheet'].write(1+cnt*30, 15, 'avg-busbw',style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) + op['bwSheet'].write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) + + # time + with open(op['nccl_time_path']) as f2: + content2 = f2.read() + times = content2.split() + + op['tmSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + for a in 
+            # time
+            with open(op['nccl_time_path']) as f2:
+                content2 = f2.read()
+            times = content2.split()
+
+            op['tmSheet'].write(cnt*30,0,str(MY_NUM_DEV)+' cards',style)
+            for a in range(0,25):
+                op['tmSheet'].write(2+a+cnt*30,0,axis_y[a],style)
+            for k in [0,1,2]:
+                op['tmSheet'].write(1+cnt*30,1+k,'nccl-'+str(k),style)
+                for i in range(0,25):
+                    op['tmSheet'].write(2+i+cnt*30,1+k,times[i+k*25+2],style)
+            # avg
+            op['tmSheet'].write(1+cnt*30, 4, 'avg-nccl',style)
+            for i in range(0,25):
+                op['tmSheet'].write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style)
+
+    # OFCCL
+    # Create the directory that holds the experiment results
+    OFCCL_RES_DIR = "./ofccl/test_result_"+DATE+"_"+NCCL_ORDER+"_"+str(MY_NUM_DEV)+"cards"
+    if not os.path.exists(OFCCL_RES_DIR):
+        os.makedirs(OFCCL_RES_DIR)
+    # Aggregated results
+    AR['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards.txt"
+    AR['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_order_"+str(MY_NUM_DEV)+"cards.txt"
+    AR['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_time.txt"
+    AR['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    AR['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    AR['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    AR['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+
+    AG['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards.txt"
+    AG['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_order_"+str(MY_NUM_DEV)+"cards.txt"
+    AG['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_time.txt"
+    AG['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    AG['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    AG['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    AG['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+
+    B['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards.txt"
+    B['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_order_"+str(MY_NUM_DEV)+"cards.txt"
+    B['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_time.txt"
+    B['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    B['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    B['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    B['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+
+    R['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards.txt"
+    R['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_order_"+str(MY_NUM_DEV)+"cards.txt"
+    R['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_time.txt"
+    R['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    R['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    R['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    R['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+    RS['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards.txt"
+    RS['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_order_"+str(MY_NUM_DEV)+"cards.txt"
+    RS['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_time.txt"
+    RS['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    RS['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    RS['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    RS['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+
+    if staticOfccl:
+        for op in [AR,AG,B,R,RS]:
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_bw_path'])
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_tm_path'])
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_bw_order_path'])
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_tm_order_path'])
+    if staticOfcclExtral:
+        for op in [AR,AG,B,R,RS]:
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_qe_path'])
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_qeOri_path'])
+
+    for iter in OFCCL_ITER:
+        # raw data
+        AR['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+        AG['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+        B['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+        R['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+        RS['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+
+        if runOfcclTest:
+            for op in [AR,AG,B,R,RS]:
+                os.system("echo $(date +%F%n%T)>> "+op['ofccl_rawData'])
+                for a in buffer_sizes:
+                    os.system(op['runOfccl']+" -b "+str(a)+" -e "+str(a)+" -f 2 -t "+str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M)+" >>"+op['ofccl_rawData'])
+        if staticOfccl:
+            for op in [AR,AG,B,R,RS]:
+                os.system("./ofccl/static_ofccl_bw.out "+op['ofccl_rawData']+" "+op['ofccl_bw_path'])
+                os.system("./ofccl/static_ofccl_time.out "+op['ofccl_rawData']+" "+op['ofccl_tm_path'])
+        if staticOfcclExtral:
+            for op in [AR,AG,B,R,RS]:
+                os.system("./ofccl/static_ofccl_QE.out "+op['ofccl_rawData']+" "+op['ofccl_qe_path'])
+                os.system("./ofccl/static_ofccl_QE_ori.out "+op['ofccl_rawData']+" "+op['ofccl_qeOri_path'])
+                os.system("./ofccl/static_ofccl_totalCnt.out "+op['ofccl_rawData']+" "+op['ofccl_totalCnt_path'])
+    if staticOfccl:
+        for op in [AR,AG,B,R,RS]:
+            os.system("./ofccl/static_ofccl_bw_order.out "+op['ofccl_bw_path']+" "+op['ofccl_bw_order_path']+" "+str(len(OFCCL_ITER)))
+            os.system("./ofccl/static_ofccl_tm_order.out "+op['ofccl_tm_path']+" "+op['ofccl_tm_order_path']+" "+str(len(OFCCL_ITER)))
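The five per-op path blocks above all follow one pattern; for reference, an equivalent loop form (a sketch only, using the AR/AG/B/R/RS dicts that the script defines earlier):

    for op, name in [(AR, "allReduce"), (AG, "allGather"), (B, "broadcast"),
                     (R, "reduce"), (RS, "reduceScatter")]:
        base = OFCCL_RES_DIR + "/result_ofccl_" + name + "_" + str(MY_NUM_DEV)
        order = OFCCL_RES_DIR + "/result_ofccl_" + name + "_order_" + str(MY_NUM_DEV)
        op['ofccl_bw_path'] = base + "cards.txt"
        op['ofccl_bw_order_path'] = order + "cards.txt"
        op['ofccl_tm_path'] = base + "cards_time.txt"
        op['ofccl_tm_order_path'] = order + "cards_time.txt"
        op['ofccl_qe_path'] = base + "cards_QE.txt"
        op['ofccl_qeOri_path'] = base + "cards_QE_ori.txt"
        op['ofccl_totalCnt_path'] = base + "cards_totalCnt.txt"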
+    if collectOfcclResult:
+        # bus bandwidth
+        for op in [AR,AG,B,R,RS]:
+            with open(op['ofccl_bw_order_path']) as f2:
+                content2 = f2.read()
+            bw = content2.split()
+
+            for k in [0,1,2]:
+                op['bwSheet'].write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style)
+                for i in range(0,25):
+                    op['bwSheet'].write(2+i+cnt*30,5+k,bw[i+k*50+2],style)
+
+                op['bwSheet'].write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style)
+                for i in range(0,25):
+                    op['bwSheet'].write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style)
+            # avg
+            op['bwSheet'].write(1+cnt*30, 8, 'avg-algbw',style)
+            op['bwSheet'].write(1+cnt*30, 19, 'avg-busbw',style)
+            for i in range(0,25):
+                op['bwSheet'].write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style)
+                op['bwSheet'].write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style)
+
+            # time
+            with open(op['ofccl_tm_order_path']) as f2:
+                content2 = f2.read()
+            times = content2.split()
+
+            for k in [0,1,2]:
+                op['tmSheet'].write(1+cnt*30,5+k,'ofccl-'+str(k),style)
+                for i in range(0,25):
+                    op['tmSheet'].write(2+i+cnt*30,5+k,times[i+k*25+2],style)
+            # avg
+            op['tmSheet'].write(1+cnt*30, 8, 'avg-ofccl',style)
+            for i in range(0,25):
+                op['tmSheet'].write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style)
+
+    if collectNcclResult and collectOfcclResult:
+        for op in [AR,AG,B,R,RS]:
+            op['bwSheet'].write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style)
+            op['bwSheet'].write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style)
+            op['tmSheet'].write(1+cnt*30, 9, 'ofccl-nccl',style)
+            op['tmSheet'].write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style)
+            for i in range(0,25):
+                op['bwSheet'].write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style)
+                op['bwSheet'].write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1)),style)
+                op['tmSheet'].write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)),style)
+                op['tmSheet'].write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)),style)
+
+    # Column headers for the extra time statistics
+    if staticOfcclExtral:
+        for op in [AR,AG,B,R,RS]:
+            op['tmSheet'].write(1+cnt*30, 13,'nccl IO',style)
+            op['tmSheet'].write(1+cnt*30, 14,'nccl kern',style)
+            op['tmSheet'].write(1+cnt*30, 15,'ofccl-nccl kern',style)
+            op['tmSheet'].write(1+cnt*30, 16,'before after get sqe',style)
+            op['tmSheet'].write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style)
+            op['tmSheet'].write(1+cnt*30, 18,'before after put cqe',style)
+            op['tmSheet'].write(1+cnt*30, 19,'beforeSqe TO afterCqe',style)
+            op['tmSheet'].write(1+cnt*30, 20,'ofccl rank0 time',style)
+            op['tmSheet'].write(1+cnt*30, 21,'nccl kern ori',style)
+            op['tmSheet'].write(1+cnt*30, 27,'before after get sqe ori',style)
+            op['tmSheet'].write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style)
+            op['tmSheet'].write(1+cnt*30, 39,'before after put cqe ori',style)
+            op['tmSheet'].write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style)
+
+            y = 64
+            for i in range(0,25):
+                op['tmSheet'].write(2+i+cnt*30,12,y,style)
+                y = y*2
+
+            with open(op['ofccl_qe_path']) as f3:
+                content3 = f3.read()
+            times = content3.split()
+            with open(op['ofccl_qeOri_path']) as f4:
+                content4 = f4.read()
+            times4 = content4.split()
+            for i in range(0,25):
+                op['tmSheet'].write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30)),style)
+                op['tmSheet'].write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+')'),style)
+                op['tmSheet'].write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30)),style)
+                op['tmSheet'].write(2+cnt*30+i,16,times[2+125*cnt+i],style)
+                op['tmSheet'].write(2+cnt*30+i,17,times[2+125*cnt+25+i],style)
+                op['tmSheet'].write(2+cnt*30+i,18,times[2+125*cnt+50+i],style)
+                op['tmSheet'].write(2+cnt*30+i,19,times[2+125*cnt+75+i],style)
+                op['tmSheet'].write(2+cnt*30+i,20,times[2+125*cnt+100+i],style)
+                for j in range(0,5):
+                    op['tmSheet'].write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style)
+                    op['tmSheet'].write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style)
+                    op['tmSheet'].write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style)
+                    op['tmSheet'].write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style)
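The flat index arithmetic above is easy to misread, so here is the layout it assumes (a sketch inferred from the writes, not from a format spec): each echo of `date +%F%n%T` contributes two leading tokens, hence the +2.

    # static_ofccl_QE.out appends, per card-count block: 2 date tokens, then
    # 5 metrics x 25 buffer sizes = 125 averaged values.
    def qe_value(tokens, block, metric, size_idx):
        return tokens[2 + 125 * block + 25 * metric + size_idx]

    # static_ofccl_QE_ori.out: 4 metrics x 25 sizes x 5 raw samples = 500 per block.
    def qe_ori_value(tokens, block, metric, size_idx, sample):
        return tokens[2 + 500 * block + 125 * metric + 5 * size_idx + sample]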
+            # cntSheet
+            op['cntSheet'].write(cnt*30,0,str(MY_NUM_DEV)+' cards',style)
+            axis_y = buffer_sizes
+            for a in range(0,25):
+                op['cntSheet'].write(2+a+cnt*30,0,axis_y[a],style)
+
+            op['cntSheet'].write(1+cnt*30,1,"totalCtxSaveCnt_avg",style)
+            op['cntSheet'].write(1+cnt*30,2,"totalCtxLoadCnt_avg",style)
+            op['cntSheet'].write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style)
+            op['cntSheet'].write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style)
+            op['cntSheet'].write(1+cnt*30,6,"totalCtxSaveCnt",style)
+            op['cntSheet'].write(1+cnt*30,24,"totalCtxLoadCnt",style)
+            op['cntSheet'].write(1+cnt*30,42,"totalProgressed7SwithchCnt",style)
+            op['cntSheet'].write(1+cnt*30,60,"totalUnprogressedQuitCnt",style)
+
+            with open(op['ofccl_totalCnt_path']) as f:
+                line = f.readline()
+                # save
+                for i in range(0,25):
+                    numbers = line.split()
+                    op['cntSheet'].write(i+2+cnt*30,1,numbers[0])
+                    for j in range(1,len(numbers)):
+                        op['cntSheet'].write(i+2+cnt*30,5+j,numbers[j])
+                    line = f.readline()
+                # load
+                for i in range(0,25):
+                    numbers = line.split()
+                    op['cntSheet'].write(i+2+cnt*30,2,numbers[0])
+                    for j in range(1,len(numbers)):
+                        op['cntSheet'].write(i+2+cnt*30,23+j,numbers[j])
+                    line = f.readline()
+                # totalProgressed7SwithchCnt
+                for i in range(0,25):
+                    numbers = line.split()
+                    op['cntSheet'].write(i+2+cnt*30,3,numbers[0])
+                    for j in range(1,len(numbers)):
+                        op['cntSheet'].write(i+2+cnt*30,41+j,numbers[j])
+                    line = f.readline()
+                # totalUnprogressedQuitCnt
+                for i in range(0,25):
+                    numbers = line.split()
+                    op['cntSheet'].write(i+2+cnt*30,4,numbers[0])
+                    for j in range(1,len(numbers)):
+                        op['cntSheet'].write(i+2+cnt*30,59+j,numbers[j])
+                    line = f.readline()
+
+    cnt = cnt+1
+
+# Save the Excel workbook
+if collectNcclResult or collectOfcclResult:
+    table.save(resultXlsName)
\ No newline at end of file
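The script shells out to ./nccl/*.out and ./ofccl/*.out, which are the C++ parsers added below compiled ahead of time. The patch itself does not include a build step; one plausible sketch (the g++ invocation and flags are an assumption, not part of this diff):

    import glob
    import subprocess

    # Hypothetical build step: compile each parser next to its source file.
    for src in glob.glob("test_scripts/*/*.cpp"):
        subprocess.run(["g++", "-O2", "-o", src.replace(".cpp", ".out"), src], check=True)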
-d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 5 + do + for w in 2 + do + for m in 1 + do + for iter in 1 + do + export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/nccl/static_nccl.cpp b/test_scripts/nccl/static_nccl.cpp new file mode 100644 index 0000000..911fd0c --- /dev/null +++ b/test_scripts/nccl/static_nccl.cpp @@ -0,0 +1,42 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + b.push_back(ss.top()); + ss.pop(); + a.push_back(ss.top()); + + if(++cnt == 25) + break; + } + + for(auto a1:a) + cout< +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + ss.pop(); + ss.pop(); + a.push_back(ss.top()); + + if(++cnt == 25) + break; + } + + for(auto a1:a) + cout< +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + int ranks = *(argv[3]) - '0'; + string str; + stringstream ss; + vector a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (11+ranks);i++) + getline(cin,line); + + for(int i =0;i < 6;i++) + cin >> str; + + a.push_back(str); + cin >> str; + b.push_back(str); + + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + for(int i=0;i +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + int ranks = *(argv[3]) - '0'; + string str; + stringstream ss; + vector a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (11+ranks);i++) + getline(cin,line); + + for(int i =0;i < 5;i++) + cin >> str; + + a.push_back(str); + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + for(int i=0;i> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/ofccl/static_ofccl_QE.cpp b/test_scripts/ofccl/static_ofccl_QE.cpp new file mode 100644 index 0000000..3705bdb --- /dev/null +++ b/test_scripts/ofccl/static_ofccl_QE.cpp @@ -0,0 +1,174 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + 
diff --git a/test_scripts/ofccl/static_ofccl_QE.cpp b/test_scripts/ofccl/static_ofccl_QE.cpp
new file mode 100644
index 0000000..3705bdb
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_QE.cpp
@@ -0,0 +1,174 @@
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+    vector<string> time;
+    vector<double> sqe;
+    vector<double> beforeCqe;
+    vector<double> putCqe;
+    vector<double> afterCqe;
+    string bw="bandwidth";
+
+    int cnt = 0;
+    double sqe_sum = 0;
+    int sqe_cnt = 0;
+
+    double beforeCqe_sum = 0;
+    int beforeCqe_cnt = 0;
+
+    double putCqe_sum = 0;
+    int putCqe_cnt = 0;
+
+    double afterCqe_sum = 0;
+    int afterCqe_cnt = 0;
+
+    while(getline(cin, inputLine)){
+        if(inputLine.find(bw,0) != -1){
+            // The "bandwidth" line marks the end of one buffer-size block.
+            // before after get sqe
+            double sqe_avg = sqe_sum / sqe_cnt;
+            sqe.push_back(sqe_avg);
+            sqe_sum = 0;
+            sqe_cnt = 0;
+            // AfterSqe TO BeforeCqe
+            double beforeCqe_avg = beforeCqe_sum / beforeCqe_cnt;
+            beforeCqe.push_back(beforeCqe_avg);
+            beforeCqe_sum = 0;
+            beforeCqe_cnt = 0;
+            // before after put cqe
+            double putCqe_avg = putCqe_sum / putCqe_cnt;
+            putCqe.push_back(putCqe_avg);
+            putCqe_sum = 0;
+            putCqe_cnt = 0;
+            // beforeSqe TO afterCqe
+            double afterCqe_avg = afterCqe_sum/afterCqe_cnt;
+            afterCqe.push_back(afterCqe_avg);
+            afterCqe_sum = 0;
+            afterCqe_cnt = 0;
+
+            if(++cnt == 25)
+                break;
+        }
+        // rank0 time
+        int pos = -1;
+        if ((pos=inputLine.find("time = ",0)) != -1){
+            pos += 7;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            time.push_back(t);
+            continue;
+        }
+
+        // before after get sqe
+        if ((pos=inputLine.find("before after get sqe AVG",0)) != -1){
+            pos += 27;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            stringstream ss;
+            double tt;
+            ss << t;
+            ss >> tt;
+            pos = inputLine.find("weight = ",0);
+            pos += 9;
+            int count = inputLine[pos] - '0';
+            sqe_sum += tt * count;
+            sqe_cnt += count;
+            continue;
+        }
+        // AfterSqe TO BeforeCqe
+        if ((pos=inputLine.find("AfterSqe TO BeforeCqe AVG",0)) != -1){
+            pos += 28;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            stringstream ss;
+            double tt;
+            ss << t;
+            ss >> tt;
+            pos = inputLine.find("weight = ",0);
+            pos += 9;
+            int count = inputLine[pos] - '0';
+            beforeCqe_sum += tt * count;
+            beforeCqe_cnt += count;
+            continue;
+        }
+
+        // before after put cqe
+        if ((pos=inputLine.find("before after put cqe AVG ",0)) != -1){
+            pos += 27;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            stringstream ss;
+            double tt;
+            ss << t;
+            ss >> tt;
+            pos = inputLine.find("weight = ",0);
+            pos += 9;
+            int count = inputLine[pos] - '0';
+            putCqe_sum += tt * count;
+            putCqe_cnt += count;
+            continue;
+        }
+        // beforeSqe TO afterCqe
+        if ((pos=inputLine.find("beforeSqe TO afterCqe AVG = ",0)) != -1){
+            pos += 28;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            stringstream ss;
+            double tt;
+            ss << t;
+            ss >> tt;
+            pos = inputLine.find("weight = ",0);
+            pos += 9;
+            int count = inputLine[pos] - '0';
+            afterCqe_sum += tt * count;
+            afterCqe_cnt += count;
+            continue;
+        }
+    }
+
+    // before after get sqe
+    for (auto s:sqe){
+        cout << s << endl;
+    }
+    // AfterSqe TO BeforeCqe
+    for (auto s:beforeCqe){
+        cout << s << endl;
+    }
+    // before after put cqe
+    for (auto s:putCqe){
+        cout << s << endl;
+    }
+    // beforeSqe TO afterCqe
+    for (auto s:afterCqe){
+        cout << s << endl;
+    }
+    // rank0 time
+    for (auto t:time){
+        cout << t << endl;
+    }
+    return 0;
+}
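static_ofccl_QE.cpp computes a weighted mean per buffer size: each log line carries a per-thread average plus its sample count ("weight = N"), and the block average weights each AVG by its N. A compact restatement (a sketch with made-up inputs):

    # Weighted mean as accumulated by static_ofccl_QE.cpp above.
    def weighted_avg(pairs):                 # pairs: [(avg, weight), ...]
        total = sum(a * w for a, w in pairs)
        n = sum(w for _, w in pairs)
        return total / n if n else float("nan")

    print(weighted_avg([(1.5, 4), (2.0, 1)]))   # -> 1.6, not the naive 1.75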
diff --git a/test_scripts/ofccl/static_ofccl_QE_ori.cpp b/test_scripts/ofccl/static_ofccl_QE_ori.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_QE_ori.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+
+    vector<double> sqe_ori;
+    vector<double> beforeCqe_ori;
+    vector<double> putCqe_ori;
+    vector<double> afterCqe_ori;
+    string bw="bandwidth";
+
+    int cnt=0;
+    while(getline(cin, inputLine)){
+        if(inputLine.find(bw,0) != -1){
+            // The "bandwidth" line marks the end of one buffer-size block.
+            if(++cnt == 25)
+                break;
+        }
+        int pos = -1;
+        // before after get sqe
+        if ((pos=inputLine.find("Rank<0> Blk<0> Thrd<0> coll_id = 0, before after get sqe = ",0)) != -1){
+            pos += 58;
+            string numbers = inputLine.substr(pos);
+            stringstream ss;
+            ss << numbers;
+            for(int i = 0;i < 5;i++){
+                double tmp;
+                ss >> tmp;
+                sqe_ori.push_back(tmp);
+            }
+            continue;
+        }
+        // AfterSqe TO BeforeCqe
+        if ((pos=inputLine.find("AfterSqe TO BeforeCqe = ",0)) != -1){
+            pos += 24;
+            string numbers = inputLine.substr(pos);
+            stringstream ss;
+            ss << numbers;
+            for(int i = 0;i < 5;i++){
+                double tmp;
+                ss >> tmp;
+                if(tmp > 0.00001)
+                    beforeCqe_ori.push_back(tmp);
+            }
+            continue;
+        }
+
+        // before after put cqe
+        if ((pos=inputLine.find("before after put cqe = ",0)) != -1){
+            pos += 23;
+            string numbers = inputLine.substr(pos);
+            stringstream ss;
+            ss << numbers;
+            for(int i = 0;i < 5;i++){
+                double tmp;
+                ss >> tmp;
+                if(tmp > 0.00001)
+                    putCqe_ori.push_back(tmp);
+            }
+            continue;
+        }
+
+        // beforeSqe TO afterCqe
+        if ((pos=inputLine.find("beforeSqe TO afterCqe = ",0)) != -1){
+            pos += 24;
+            string numbers = inputLine.substr(pos);
+            stringstream ss;
+            ss << numbers;
+            for(int i = 0;i < 5;i++){
+                double tmp;
+                ss >> tmp;
+                if(tmp > 0.00001)
+                    afterCqe_ori.push_back(tmp);
+            }
+            continue;
+        }
+    }
+
+    // before after get sqe
+    for(int i = 0;i < 25;i++){
+        for(int j = 0;j < 5;j++)
+            cout<<sqe_ori[i*5+j]<<" ";
+        cout<<endl;
+    }
+    // AfterSqe TO BeforeCqe
+    for(int i = 0;i < 25;i++){
+        for(int j = 0;j < 5;j++)
+            cout<<beforeCqe_ori[i*5+j]<<" ";
+        cout<<endl;
+    }
+    // before after put cqe
+    for(int i = 0;i < 25;i++){
+        for(int j = 0;j < 5;j++)
+            cout<<putCqe_ori[i*5+j]<<" ";
+        cout<<endl;
+    }
+    // beforeSqe TO afterCqe
+    for(int i = 0;i < 25;i++){
+        for(int j = 0;j < 5;j++)
+            cout<<afterCqe_ori[i*5+j]<<" ";
+        cout<<endl;
+    }
+    return 0;
+}
diff --git a/test_scripts/ofccl/static_ofccl_bw.cpp b/test_scripts/ofccl/static_ofccl_bw.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_bw.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+    vector<string> a;
+    vector<string> b;
+    string ss="bandwidth";
+    string str = "N/A";
+    int cnt = 0;
+    while(getline(cin, inputLine)){
+        if (inputLine.find(str,0) == -1)
+            continue;
+
+        stringstream line;
+        line << inputLine;
+        string tmp;
+        stack<string> ss;
+        while(line >> tmp){
+            ss.push(tmp);
+        }
+        ss.pop();
+        b.push_back(ss.top());
+        ss.pop();
+        a.push_back(ss.top());
+
+        if(++cnt == 25)
+            break;
+    }
+
+    for(auto a1:a)
+        cout<<a1<<endl;
+    for(auto b1:b)
+        cout<<b1<<endl;
+    return 0;
+}
diff --git a/test_scripts/ofccl/static_ofccl_bw_order.cpp b/test_scripts/ofccl/static_ofccl_bw_order.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_bw_order.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+    int num = *(argv[3]) - '0';
+
+    // One max-heap per buffer size: re-emit bandwidths best-first.
+    vector<priority_queue<double,vector<double>,less<double>>> a(25,priority_queue<double,vector<double>,less<double>>());
+    vector<priority_queue<double,vector<double>,less<double>>> b(25,priority_queue<double,vector<double>,less<double>>());
+
+    for(int i = 0;i < num;i++){
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            cin>>tmp;
+            a[j].push(tmp);
+        }
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            cin>>tmp;
+            b[j].push(tmp);
+        }
+    }
+
+    for(int i = 0;i < num;i++){
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            tmp = a[j].top();a[j].pop();
+            cout<<tmp<<endl;
+        }
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            tmp = b[j].top();b[j].pop();
+            cout<<tmp<<endl;
+        }
+    }
+    return 0;
+}
diff --git a/test_scripts/ofccl/static_ofccl_time.cpp b/test_scripts/ofccl/static_ofccl_time.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_time.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+    vector<string> a;
+    vector<string> b;
+    string ss="bandwidth";
+    string str = "N/A";
+    int cnt = 0;
+    while(getline(cin, inputLine)){
+        if (inputLine.find(str,0) == -1)
+            continue;
+
+        stringstream line;
+        line << inputLine;
+        string tmp;
+        stack<string> ss;
+        while(line >> tmp){
+            ss.push(tmp);
+        }
+        ss.pop();
+        ss.pop();
+        ss.pop();
+        a.push_back(ss.top());
+
+        if(++cnt == 25)
+            break;
+    }
+
+    for(auto a1:a)
+        cout<<a1<<endl;
+    return 0;
+}
diff --git a/test_scripts/ofccl/static_ofccl_tm_order.cpp b/test_scripts/ofccl/static_ofccl_tm_order.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_tm_order.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+    int num = *(argv[3]) - '0';
+
+    // One min-heap per buffer size: re-emit times fastest-first.
+    vector<priority_queue<double,vector<double>,greater<double>>> a(25,priority_queue<double,vector<double>,greater<double>>());
+
+    for(int i = 0;i < num;i++){
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            cin>>tmp;
+            a[j].push(tmp);
+        }
+    }
+
+    for(int i = 0;i < num;i++){
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            tmp = a[j].top();a[j].pop();
+            cout<<tmp<<endl;
+        }
+    }
+    return 0;
+}
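The two *_order helpers above take num iterations of 25 values each and re-emit them per buffer size in sorted order: descending for bandwidth (best first), ascending for time. The same transform in Python (a sketch with made-up values):

    def reorder(rows, descending):
        # rows[j] holds one buffer size's values across iterations.
        return [sorted(r, reverse=descending) for r in rows]

    bw_rows = [[7.0, 9.0, 8.0]] * 25          # 25 sizes x 3 iterations
    print(reorder(bw_rows, True)[0])          # [9.0, 8.0, 7.0] -> best first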
diff --git a/test_scripts/ofccl/static_ofccl_totalCnt.cpp b/test_scripts/ofccl/static_ofccl_totalCnt.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_totalCnt.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+    vector<vector<int>> save_ori(25,vector<int>());
+    vector<vector<int>> load_ori(25,vector<int>());
+    vector<vector<int>> p7s_ori(25,vector<int>());
+    vector<vector<int>> quit_ori(25,vector<int>());
+
+    vector<double> save_avg;
+    vector<double> load_avg;
+    vector<double> p7s_avg;
+    vector<double> quit_avg;
+
+    string bw="bandwidth";
+
+    int cnt=0;
+    while(getline(cin, inputLine)){
+        if(inputLine.find(bw,0) != -1){
+            // The "bandwidth" line marks the end of one buffer-size block.
+            // save
+            double sum = accumulate(begin(save_ori[cnt]), end(save_ori[cnt]), 0);
+            double mean = sum / save_ori[cnt].size();
+            save_avg.push_back(mean);
+            // load
+            sum = accumulate(begin(load_ori[cnt]), end(load_ori[cnt]), 0);
+            mean = sum / load_ori[cnt].size();
+            load_avg.push_back(mean);
+            // p7s
+            sum = accumulate(begin(p7s_ori[cnt]), end(p7s_ori[cnt]), 0);
+            mean = sum / p7s_ori[cnt].size();
+            p7s_avg.push_back(mean);
+            // quit
+            sum = accumulate(begin(quit_ori[cnt]), end(quit_ori[cnt]), 0);
+            mean = sum / quit_ori[cnt].size();
+            quit_avg.push_back(mean);
+
+            if(++cnt == 25)
+                break;
+        }
+
+        int pos = 0;
+        // save
+        while((pos=inputLine.find("totalCtxSaveCnt=",pos)) != -1){
+            pos += 16;
+            int number = 0;
+            while(inputLine[pos]>='0' && inputLine[pos]<='9'){
+                number = number*10 + (inputLine[pos]-'0');
+                pos++;
+            }
+            save_ori[cnt].push_back(number);
+        }
+        pos = 0;
+        while((pos=inputLine.find("totalCtxLoadCnt=",pos)) != -1){
+            pos += 16;
+            int number = 0;
+            while(inputLine[pos]>='0' && inputLine[pos]<='9'){
+                number = number*10 + (inputLine[pos]-'0');
+                pos++;
+            }
+            load_ori[cnt].push_back(number);
+        }
+
+        pos = 0;
+        while((pos=inputLine.find("totalProgressed7SwithchCnt=",pos)) != -1){
+            pos += 27;
+            int number = 0;
+            while(inputLine[pos]>='0' && inputLine[pos]<='9'){
+                number = number*10 + (inputLine[pos]-'0');
+                pos++;
+            }
+            p7s_ori[cnt].push_back(number);
+        }
+
+        pos = 0;
+        while((pos=inputLine.find("totalUnprogressedQuitCnt=",pos)) != -1){
+            pos += 25;
+            int number = 0;
+            while(inputLine[pos]>='0' && inputLine[pos]<='9'){
+                number = number*10 + (inputLine[pos]-'0');
+                pos++;
+            }
+            quit_ori[cnt].push_back(number);
+        }
+    }
+
+    // Per metric: 25 lines of "avg raw raw raw ...", as the collector expects.
+    for(int i = 0;i < 25;i++){
+        cout << save_avg[i]<<" ";
+        for(auto num:save_ori[i])
+            cout<<num<<" ";
+        cout<<endl;
+    }
+    for(int i = 0;i < 25;i++){
+        cout << load_avg[i]<<" ";
+        for(auto num:load_ori[i])
+            cout<<num<<" ";
+        cout<<endl;
+    }
+    for(int i = 0;i < 25;i++){
+        cout << p7s_avg[i]<<" ";
+        for(auto num:p7s_ori[i])
+            cout<<num<<" ";
+        cout<<endl;
+    }
+    for(int i = 0;i < 25;i++){
+        cout << quit_avg[i]<<" ";
+        for(auto num:quit_ori[i])
+            cout<<num<<" ";
+        cout<<endl;
+    }
+    return 0;
+}
[...]
+    while(cin>>c){
+        if(c == '!')
+            break;
+        flag = true;
+        flag2 = true;
+        for(int i =0;i < a.size();i++){
+            if( c != a[i]){
+                flag = false;
+            }
+            if(i < b.size() && c != b[i]){
+                flag2 = false;
+            }
+            if(flag == false && flag2 == false)
+                break;
+            cin >> c;
+        }
+        if(flag){
+            cnt++;
+            int tmp = 0;
+            while( c >= '0' && c <= '9'){
+                tmp = tmp*10 + c - '0';
+                scanf("%c",&c);
+            }
+            sum += tmp;
+        }
+        if(flag2){
+            cout << (sum * 1.0)/cnt << endl;