diff --git a/.gitignore b/.gitignore
index a0a013e..99f99d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,11 @@
 #
 # See LICENCE.txt for license information
 /build
+
+.clangd
+
+.vscode
+
+*result*/
+*.xls
+*.out
\ No newline at end of file
diff --git a/README.md b/README.md
index bff6433..1c3c505 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,7 @@ All tests support the same set of arguments :
   * `-n,--iters <iteration count>` number of iterations. Default : 20.
   * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
   * `-m,--agg_iters <aggregated iteration count>` number of operations to aggregate together in each iteration. Default : 1.
+  * `-M,--multi_iters <multi iteration count>` number of operations with separate ncclComms in each iteration. Default : 1.
   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
diff --git a/nccl_test.sh b/nccl_test.sh
new file mode 100644
index 0000000..1435e51
--- /dev/null
+++ b/nccl_test.sh
@@ -0,0 +1,87 @@
+clear
+
+export MY_NUM_DEV=$1
+
+cd /home/panlichen/work2/nccl-tests
+export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib
+export NCCL_PROTO=Simple
+export NCCL_ALGO=Ring
+# export NCCL_MAX_NCHANNELS=1
+# export NCCL_MIN_NCHANNELS=1
+# export NCCL_NTHREADS=64
+
+if [ -z $BINARY ];then
+  BINARY="DEBUG"
+  # BINARY="MS"
+  # BINARY="PERF"
+fi
+
+FUNC=$2
+
+if [ "$FUNC" == "AR" ]; then
+  target="./build/all_reduce_perf"
+elif [ "$FUNC" == "AG" ]; then
+  target="./build/all_gather_perf"
+elif [ "$FUNC" == "RS" ]; then
+  target="./build/reduce_scatter_perf"
+elif [ "$FUNC" == "R" ]; then
+  target="./build/reduce_perf"
+elif [ "$FUNC" == "B" ]; then
+  target="./build/broadcast_perf"
+fi
+
+
+if [ "$BINARY" == "DEBUG" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  export NITER=5
+  export NBYTES=64M
+  export WARMITER=2
+  export MITER=1
+  export CHECK=0
+elif [ "$BINARY" == "PERF" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  export NITER=4
+  export NBYTES=8K
+  export WARMITER=2
+  export MITER=4
+  export CHECK=0
+elif [ "$BINARY" == "MS" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  # export NITER=200
+  # export SHOW_ALL_PREPARED_COLL=1
+  # export WARMITER=0
+  # export NBYTES=8K
+  # export MITER=4
+fi
+
+export NSYS_FILE="nccl"
+export NCU_FILE="nccl"
+
+if [ -z $RUN_TYPE ];then
+  RUN_TYPE="PURE"
+  # RUN_TYPE="GDB"
+  # RUN_TYPE="NSYS"
+  # RUN_TYPE="NCU"
+fi
+
+if [ "$RUN_TYPE" == "PURE" ];then
+  cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER"
+elif [ "$RUN_TYPE" == "GDB" ];then
+  cmd="cuda-gdb $target"
+  # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0
+elif [ "$RUN_TYPE" == "NSYS" ];then
+  cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER"
+elif [ "$RUN_TYPE" == "NCU" ];then
+  # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER"
+  cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER"
+fi
+
+echo cmd=$cmd
+$cmd #> /home/panlichen/work2/ofccl/log/ofccl-2ms-coll-master.log
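The new -M/--multi_iters flag (README hunk above) makes each timed iteration issue M operations, each on its own communicator. A minimal sketch of that launch pattern, assuming one GPU per comm; MULTI_ITERS and launchMultiIters are illustrative names, not part of this patch:

#include <nccl.h>
#include <cuda_runtime.h>

#define MULTI_ITERS 4  // illustrative value for -M

// Issue MULTI_ITERS independent allreduces, one per communicator, mirroring
// the args->comms[miter * nGpus + i] selection in common_inplace.cu below.
void launchMultiIters(ncclComm_t comms[MULTI_ITERS],
                      cudaStream_t streams[MULTI_ITERS],
                      const float *send, float *recv, size_t count) {
  for (int miter = 0; miter < MULTI_ITERS; miter++) {
    ncclAllReduce(send, recv, count, ncclFloat, ncclSum,
                  comms[miter], streams[miter]);
  }
}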
diff --git a/ofccl_test.sh b/ofccl_test.sh
new file mode 100644
index 0000000..1e62664
--- /dev/null
+++ b/ofccl_test.sh
@@ -0,0 +1,166 @@
+clear
+
+export MY_NUM_DEV=$1
+
+export DEBUG_CC=1
+export DEBUG_ENQ=1
+
+unset DEBUG_CC
+unset DEBUG_ENQ
+
+export DEBUG_NT=1
+unset DEBUG_NT
+
+cd /home/panlichen/work2/nccl-tests
+export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib
+export NCCL_PROTO=Simple
+export NCCL_ALGO=Ring
+# export NCCL_MAX_NCHANNELS=1
+# export NCCL_MIN_NCHANNELS=1
+# export NCCL_NTHREADS=64
+
+export CHECK=0
+export SHOW_ALL_PREPARED_COLL=0
+
+export TRAVERSE_TIMES=10
+export TOLERANT_UNPROGRESSED_CNT=10000
+export BASE_CTX_SWITCH_THRESHOLD=80
+export BOUNS_SWITCH_4_PROCESSED_COLL=0
+export DEV_TRY_ROUND=10
+export CHECK_REMAINING_SQE_INTERVAL=10000
+export DEBUG_FILE="/home/panlichen/work2/ofccl/log/oneflow_cpu_rank_"
+
+rm -rf /home/panlichen/work2/ofccl/log
+mkdir -p /home/panlichen/work2/ofccl/log
+
+# export ENABLE_VQ=1 # volunteer quit
+# export TOLERANT_FAIL_CHECK_SQ_CNT=5000
+# export CNT_BEFORE_QUIT=5
+
+echo TRAVERSE_TIMES=$TRAVERSE_TIMES
+echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT
+echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD
+echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL
+echo DEV_TRY_ROUND=$DEV_TRY_ROUND
+echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL
+echo DEBUG_FILE=$DEBUG_FILE
+
+if [ ! -z $ENABLE_VQ ];then
+  echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT
+  echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT
+fi
+
+FUNC=$2
+if [ -z $FUNC ]; then
+  FUNC="AR"
+fi
+
+if [ "$FUNC" == "AR" ]; then
+  target="./build/ofccl_all_reduce_perf"
+elif [ "$FUNC" == "AG" ]; then
+  target="./build/ofccl_all_gather_perf"
+elif [ "$FUNC" == "RS" ]; then
+  target="./build/ofccl_reduce_scatter_perf"
+elif [ "$FUNC" == "R" ]; then
+  target="./build/ofccl_reduce_perf"
+elif [ "$FUNC" == "B" ]; then
+  target="./build/ofccl_broadcast_perf"
+fi
+
+if [ -z $BINARY ];then
+  BINARY="DEBUG"
+  # BINARY="MS"
+  # BINARY="PERF"
+fi
+
+if [ "$BINARY" == "DEBUG" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  if [ $MY_NUM_DEV = 2 ]; then
+    export CUDA_VISIBLE_DEVICES=4,5
+  fi
+  export NITER=5
+  export NBYTES=64M
+  export WARMITER=2
+  export MITER=1
+elif [ "$BINARY" == "PERF" ];then
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  export NITER=8
+  export NBYTES=8K
+  export WARMITER=2
+  export MITER=1
+elif [ "$BINARY" == "MS" ];then
+  target="./build/ofccl_all_reduce_ms_perf"
+  if [ $MY_NUM_DEV = 4 ]; then
+    export CUDA_VISIBLE_DEVICES=0,1,4,5
+  fi
+  export NITER=200
+  export SHOW_ALL_PREPARED_COLL=1
+  export WARMITER=0
+  export NBYTES=8K
+  export MITER=4
+  export CHECK=0
+fi
+
+export NSYS_FILE="ofccl"
+export NCU_FILE="ofccl"
+
+if [ -z $RUN_TYPE ];then
+  RUN_TYPE="PURE"
+  # RUN_TYPE="GDB"
+  # RUN_TYPE="NSYS"
+  # RUN_TYPE="NCU"
+fi
+
+# typedef enum { ncclInt8 = 0, ncclChar = 0,
+#                ncclUint8 = 1,
+#                ncclInt32 = 2, ncclInt = 2,
+#                ncclUint32 = 3,
+#                ncclInt64 = 4,
+#                ncclUint64 = 5,
+#                ncclFloat16 = 6, ncclHalf = 6,
+#                ncclFloat32 = 7, ncclFloat = 7,
+#                ncclFloat64 = 8, ncclDouble = 8,
+# #if defined(__CUDA_BF16_TYPES_EXIST__)
+#                ncclBfloat16 = 9,
+#                ncclNumTypes = 10
+# #else
+#                ncclNumTypes = 9
+# #endif
+# } ncclDataType_t;
+
+# Use this one:
+# const char *test_typenames[ncclNumTypes] = {"int8",
+#                                             "uint8",
+#                                             "int32",
+#                                             "uint32",
+#                                             "int64",
+#                                             "uint64",
+#                                             "half",
+#                                             "float",
+#                                             "double"
+# #if defined(__CUDA_BF16_TYPES_EXIST__) && \
+#     NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+#                                             ,
+#                                             "bfloat16"
+# #endif
+# };
+
+if [ "$RUN_TYPE" == "PURE" ];then
+  cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" # -d half
+elif [ "$RUN_TYPE" == "GDB" ];then
+  cmd="cuda-gdb $target"
+  # set args -b 64 -e 64 -f 2 -t 2 -g 1 -n 1 -w 0 -c 0
+elif [ "$RUN_TYPE" == "NSYS" ];then
+  cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER"
+elif [ "$RUN_TYPE" == "NCU" ];then
+  # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER"
+  cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER"
+fi
+
+echo cmd=$cmd
+$cmd #> /home/panlichen/work2/ofccl/log/ofccl.log

diff --git a/src/Makefile b/src/Makefile
index 2a399db..5927cc2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda
 PREFIX ?= /usr/local
 VERBOSE ?= 0
-DEBUG ?= 0
+DEBUG_NT ?= 0
 
 CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
@@ -19,27 +19,39 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
-NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-                -gencode=arch=compute_80,code=sm_80 \
-                -gencode=arch=compute_80,code=compute_80
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+#                 -gencode=arch=compute_61,code=sm_61 \
+#                 -gencode=arch=compute_70,code=sm_70 \
+#                 -gencode=arch=compute_80,code=sm_80 \
+#                 -gencode=arch=compute_80,code=compute_80
+# else
+# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+#                 -gencode=arch=compute_50,code=sm_50 \
+#                 -gencode=arch=compute_60,code=sm_60 \
+#                 -gencode=arch=compute_61,code=sm_61 \
+#                 -gencode=arch=compute_70,code=sm_70 \
+#                 -gencode=arch=compute_70,code=compute_70
+# endif
+
+CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86
+CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75
+
+CARDNAME ?= 3080
+ifeq ($(CARDNAME), 3080)
+NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE)
 else
-NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_60,code=sm_60 \
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_70,code=sm_70 \
-                -gencode=arch=compute_70,code=compute_70
+NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE)
 endif
+$(info CARDNAME $(CARDNAME))
+$(info NVCC_GENCODE $(NVCC_GENCODE))
 
 NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
 
 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt
 NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
 
-ifeq ($(DEBUG), 0)
+ifeq ($(DEBUG_NT), 0)
 NVCUFLAGS += -O3 -g
 CXXFLAGS += -O3 -g
 else
@@ -72,6 +84,8 @@ endif
 LIBRARIES += nccl
 NVLDFLAGS += $(LIBRARIES:%=-l%)
 
+$(info NVCUFLAGS $(NVCUFLAGS))
+
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
diff --git a/src/common.cu b/src/common.cu
index 05f814d..fea29f0 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -590,7 +590,7 @@ testResult_t completeColl(struct threadArgs* args) {
 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root,
     int in_place) {
   size_t count = args->nbytes / wordSize(type);
-  if (datacheck) {
+  if (datacheck) { // The intent here seems to be that colls run only for bandwidth measurement also use non-zero data.
     // Initialize sendbuffs, recvbuffs and expected
     TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
   }
@@ -652,6 +652,9 @@
   double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
   deltaSec = deltaSec/(iters*agg_iters);
   if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
+  // int cudaDev;
+  // cudaGetDevice(&cudaDev);
+  // OFTEST_LOG(TEST, "Rank<%d>, time = %lfus", cudaDev, deltaSec * 1.0E6);
   Allreduce(args, &deltaSec, average);
 
 #if CUDART_VERSION >= 11030
@@ -732,11 +735,13 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   } else {
     sprintf(timeStr, "%7.2f", timeUsec);
   }
-  if (datacheck) {
-    PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
-  } else {
-    PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
-  }
+  #ifndef NCCL_DEBUG_CLOCK
+    if (datacheck) {
+      PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
+    } else {
+      PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
+    }
+  #endif
 
   args->bw[0] += busBw;
   args->bw_count[0]++;
@@ -775,9 +780,12 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
   // Benchmark
   for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
       setupArgs(size, type, args);
-      print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+
+      #ifndef NCCL_DEBUG_CLOCK
+        print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+      #endif
       TESTCHECK(BenchTime(args, type, op, root, 0));
-      TESTCHECK(BenchTime(args, type, op, root, 1));
+      // TESTCHECK(BenchTime(args, type, op, root, 1));
       PRINT("\n");
   }
   return testSuccess;
@@ -1027,13 +1035,16 @@ testResult_t run() {
 #endif
   is_main_thread = (proc == 0) ? 1 : 0;
 
-  PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
-      (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
-  if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
-  if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
-  PRINT("#\n");
+  #ifndef NCCL_DEBUG_CLOCK
+    PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
+        (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
+    if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
+    if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
+    PRINT("#\n");
+
+    PRINT("# Using devices\n");
+  #endif
 
-  PRINT("# Using devices\n");
 #define MAX_LINE 2048
   char line[MAX_LINE];
   int len = 0;
@@ -1048,20 +1059,21 @@ testResult_t run() {
     maxMem = std::min(maxMem, prop.totalGlobalMem);
   }
 
-#if MPI_SUPPORT
-  char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL;
-  // Gather all output in rank order to root (0)
-  MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
-  if (proc == 0) {
-    for (int p = 0; p < nProcs; p++)
-      PRINT("%s", lines+MAX_LINE*p);
-    free(lines);
-  }
-  MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
-#else
-  PRINT("%s", line);
+#ifndef NCCL_DEBUG_CLOCK
+  #if MPI_SUPPORT
+    char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL;
+    // Gather all output in rank order to root (0)
+    MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
+    if (proc == 0) {
+      for (int p = 0; p < nProcs; p++)
+        PRINT("%s", lines+MAX_LINE*p);
+      free(lines);
+    }
+    MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
+  #else
+    PRINT("%s", line);
+  #endif
 #endif
-
   // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
   size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
   if (maxBytes > memMaxBytes) {
@@ -1118,8 +1130,10 @@ testResult_t run() {
     errors[t] = bw_count[t] = 0;
   }
 
-  PRINT("#\n");
-  print_header();
+  #ifndef NCCL_DEBUG_CLOCK
+    PRINT("#\n");
+    print_header();
+  #endif
 
   int* sync = (int*)calloc(2, sizeof(int));
   int* barrier = (int*)calloc(2, sizeof(int));
@@ -1199,9 +1213,14 @@
   double check_avg_bw = str ? atof(str) : -1;
   bw[0] /= bw_count[0];
 
-  PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
-  PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
-  PRINT("#\n");
+  #ifndef NCCL_DEBUG_CLOCK
+    PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
+    PRINT("# Avg bus bandwidth    : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
+    PRINT("#\n");
+  #else
+    PRINT("\n");
+    PRINT("\n");
+  #endif
 #ifdef MPI_SUPPORT
   MPI_Finalize();
 #endif
diff --git a/src/common.h b/src/common.h
index bd84d01..a6703b2 100644
--- a/src/common.h
+++ b/src/common.h
@@ -16,6 +16,10 @@
 #include <pthread.h>
 #include "nccl1_compat.h"
 
+#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args)
+
+// #define NCCL_DEBUG_CLOCK 1
+
 #define CUDACHECK(cmd) do {                         \
   cudaError_t err = cmd;                            \
   if( err != cudaSuccess ) {                        \
diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h
index 020a4bc..32f04e6 100644
--- a/src/nccl1_compat.h
+++ b/src/nccl1_compat.h
@@ -3,7 +3,7 @@
  *
  * See LICENSE.txt for license information
  ************************************************************************/
-
+#include <stdio.h>
 #ifndef NCCL1_COMPAT_H
 #define NCCL1_COMPAT_H
@@ -14,8 +14,8 @@
 #define ncclNumOps nccl_NUM_OPS
 #define ncclNumTypes nccl_NUM_TYPES
 
-static ncclResult_t ncclGroupStart() { return ncclSuccess; }
-static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
+static ncclResult_t ncclGroupStart() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; }
+static ncclResult_t ncclGroupEnd() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; }
 
 #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
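The OFTEST_LOG macro added to common.h above prefixes every message with file, line, and function context; PRE is a bare tag that gets stringized, and the remaining arguments feed printf. A minimal usage sketch (the rank value and the standalone main() are illustrative, not part of this patch):

#include <cstdio>

#define OFTEST_LOG(PRE, FMT, args...) \
  printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", \
         __FILE__, __LINE__, __func__, args)

int main() {
  int cudaDev = 0; // illustrative value
  // Prints e.g.: (testlog) [demo.cc:10] <main> TEST Rank<0>, multi_iters = 4
  OFTEST_LOG(TEST, "Rank<%d>, multi_iters = %d", cudaDev, 4);
  return 0;
}

Note the GNU named-variadic form (args...) requires at least one trailing argument, which matches how the test sources call it (a separate OFTEST_LOG1 is used for the zero-argument case).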
diff --git a/src_inplace/Makefile b/src_inplace/Makefile
new file mode 100644
index 0000000..840c997
--- /dev/null
+++ b/src_inplace/Makefile
@@ -0,0 +1,109 @@
+#
+# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG_NT ?= 0
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+NVCC = $(CUDA_HOME)/bin/nvcc
+CUDARTLIB ?= cudart
+
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+
+# Better define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+#                 -gencode=arch=compute_61,code=sm_61 \
+#                 -gencode=arch=compute_70,code=sm_70 \
+#                 -gencode=arch=compute_80,code=sm_80 \
+#                 -gencode=arch=compute_80,code=compute_80
+# else
+# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+#                 -gencode=arch=compute_50,code=sm_50 \
+#                 -gencode=arch=compute_60,code=sm_60 \
+#                 -gencode=arch=compute_61,code=sm_61 \
+#                 -gencode=arch=compute_70,code=sm_70 \
+#                 -gencode=arch=compute_70,code=compute_70
+# endif
+
+CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86
+CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75
+
+CARDNAME ?= 3080
+ifeq ($(CARDNAME), 3080)
+NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE)
+else
+NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE)
+endif
+$(info CARDNAME $(CARDNAME))
+$(info NVCC_GENCODE $(NVCC_GENCODE))
+
+NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
+
+LDFLAGS := -L${CUDA_LIB} -lcudart -lrt
+NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
+
+ifeq ($(DEBUG_NT), 0)
+NVCUFLAGS += -O3 -g
+CXXFLAGS += -O3 -g
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS += -O0 -g -ggdb3
+endif
+
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
+else
+.SILENT:
+endif
+
+.PHONY: build clean
+
+BUILDDIR ?= ../build
+ifneq ($(NCCL_HOME), "")
+NVCUFLAGS += -I$(NCCL_HOME)/include/
+NVLDFLAGS += -L$(NCCL_HOME)/lib
+endif
+
+ifeq ($(MPI), 1)
+NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include
+NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi
+endif
+ifeq ($(MPI_IBM),1)
+NVCUFLAGS += -DMPI_SUPPORT
+NVLDFLAGS += -lmpi_ibm
+endif
+LIBRARIES += nccl
+NVLDFLAGS += $(LIBRARIES:%=-l%)
+
+$(info NVCUFLAGS $(NVCUFLAGS))
+
+DST_DIR := $(BUILDDIR)
+SRC_FILES := $(wildcard *.cu)
+OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
+BIN_FILES_LIST := ofccl_all_reduce_inp
+BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
+
+build: ${BIN_FILES}
+
+clean:
+	rm -rf ${DST_DIR}
+
+${DST_DIR}/%.o: %.cu common_inplace.h
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) -c $<
+
+${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_inplace.o
+	@printf "Linking  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
+
diff --git a/src_inplace/common_inplace.cu b/src_inplace/common_inplace.cu
new file mode 100644
index 0000000..22cfecb
--- /dev/null
+++ b/src_inplace/common_inplace.cu
@@ -0,0 +1,1477 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common_inplace.h"
+#include "cuda.h"
+#include "nccl.h"
+#include <pthread.h>
+#include <cstdio>
+#include <getopt.h>
+#include <libgen.h>
+#include <chrono>
+
+int test_ncclVersion = 0; // init'd with ncclGetVersion()
+
+#if NCCL_MAJOR >= 2
+ncclDataType_t test_types[ncclNumTypes] = {ncclInt8,
+                                           ncclUint8,
+                                           ncclInt32,
+                                           ncclUint32,
+                                           ncclInt64,
+                                           ncclUint64,
+                                           ncclHalf,
+                                           ncclFloat,
+                                           ncclDouble
+#if defined(__CUDA_BF16_TYPES_EXIST__) && \
+    NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+                                           ,
+                                           ncclBfloat16
+#endif
+};
+const char *test_typenames[ncclNumTypes] = {"int8",
+                                            "uint8",
+                                            "int32",
+                                            "uint32",
+                                            "int64",
+                                            "uint64",
+                                            "half",
+                                            "float",
+                                            "double"
+#if defined(__CUDA_BF16_TYPES_EXIST__) && \
+    NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+                                            ,
+                                            "bfloat16"
+#endif
+};
+int test_typenum = -1;
+
+const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"};
+ncclRedOp_t test_ops[] = {
+    ncclSum,
+    ncclProd,
+    ncclMax,
+    ncclMin
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+    ,
+    ncclAvg
+#endif
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    ,
+    ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand
+#endif
+};
+int test_opnum = -1;
+#else
+ncclDataType_t test_types[ncclNumTypes] = {
+    ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64};
+const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float",
+                                            "double", "int64", "uint64"};
+int test_typenum = 7;
+const char *test_opnames[] = {"sum", "prod", "max", "min"};
+ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin};
+int test_opnum = 4;
+#endif
+
+thread_local int is_main_thread = 0;
+
+// Command line parameter defaults
+static int nThreads = 1;
+static int nGpus = 1;
+static size_t minBytes = 32 * 1024 * 1024;
+static size_t maxBytes = 32 * 1024 * 1024;
+static size_t stepBytes = 1 * 1024 * 1024;
+static size_t stepFactor = 1;
+static int datacheck = 1;
+static int warmup_iters = 5;
+static int iters = 20;
+static int agg_iters = 1;
+static int multi_iters = 1;
+static int ncclop = ncclSum;
+static int nccltype = ncclFloat;
+static int ncclroot = 0;
+static int parallel_init = 0;
+static int blocking_coll = 0;
+static int cudaGraphLaunches = 0;
+// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
+static int average = 1;
+
+#define NUM_BLOCKS 32
+
+static thread_local CallBackArgs cbArgList[MAX_COLL_NUM];
+static thread_local int seenCqe[MAX_COLL_NUM];
+
+static double parsesize(const char *value) {
+  long long int units;
+  double size;
+  char size_lit;
+
+  int count = sscanf(value, "%lf %1s", &size, &size_lit);
+
+  switch (count) {
+  case 2:
+    switch (size_lit) {
+    case 'G':
+    case 'g':
+      units = 1024 * 1024 * 1024;
+      break;
+    case 'M':
+    case 'm':
+      units = 1024 * 1024;
+      break;
+    case 'K':
+    case 'k':
+      units = 1024;
+      break;
+    default:
+      return -1.0;
+    };
+    break;
+  case 1:
+    units = 1;
+    break;
+  default:
+    return -1.0;
+  }
+
+  return size * units;
+}
+
+double DeltaMaxValue(ncclDataType_t type) {
+  switch (type) {
+  case ncclHalf:
+    return 1e-2;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16:
+    return 1e-2;
+#endif
+  case ncclFloat:
+    return 1e-5;
+  case ncclDouble:
+    return 1e-12;
+  case ncclInt:
+#if NCCL_MAJOR >= 2
+  case ncclUint8:
+    // case ncclInt32:
+  case ncclUint32:
+#endif
+  case ncclInt64:
+  case ncclUint64:
+    return 1e-200;
+  }
+  return 1e-200;
+}
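DeltaMaxValue() above gives the per-type, per-rank tolerance; CheckData() further down in this file scales it by (nranks - 1) before flagging an error. A small worked example of that threshold, with illustrative values:

#include <cstdio>

int main() {
  double perRankTol = 1e-5;  // DeltaMaxValue(ncclFloat)
  int nranks = 8;            // illustrative world size
  // CheckData() flags an error when maxDelta > perRankTol * (nranks - 1).
  double threshold = perRankTol * (nranks - 1);
  printf("float allreduce across %d ranks tolerates maxDelta up to %g\n",
         nranks, threshold);  // 7e-05
  return 0;
}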
a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue(offset, rep, rank)); +} +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template __device__ T ncclOpSum(T a, T b) { return a + b; } +template __device__ T ncclOpProd(T a, T b) { return a * b; } +template __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template __device__ T ncclOpMin(T a, T b) { return a < b ? 
+// Operations
+template <typename T> __device__ T ncclOpSum(T a, T b) { return a + b; }
+template <typename T> __device__ T ncclOpProd(T a, T b) { return a * b; }
+template <typename T> __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; }
+template <typename T> __device__ T ncclOpMin(T a, T b) { return a < b ? a : b; }
+
+// Definitions for half
+template <> __device__ half ncclOpSum(half a, half b) {
+  return __float2half(__half2float(a) + __half2float(b));
+}
+template <> __device__ half ncclOpProd(half a, half b) {
+  return __float2half(__half2float(a) * __half2float(b));
+}
+template <> __device__ half ncclOpMax(half a, half b) {
+  return __half2float(a) > __half2float(b) ? a : b;
+}
+template <> __device__ half ncclOpMin(half a, half b) {
+  return __half2float(a) < __half2float(b) ? a : b;
+}
+
+template <typename T> __device__ T ncclPPOpIdent(T x, int arg) { return x; }
+template <typename T> __device__ T ncclPPOpMul(T x, int arg) {
+  return x * T(arg);
+}
+template <typename T> __device__ T ncclPPOpDiv(T x, int arg) {
+  return x / T(arg);
+}
+template <> __device__ half ncclPPOpMul(half x, int arg) {
+  return __float2half(__half2float(x) * float(arg));
+}
+template <> __device__ half ncclPPOpDiv(half x, int n) {
+  return __float2half(__half2float(x) / n);
+}
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) {
+  return __float2bfloat16(__bfloat162float(x) * float(arg));
+}
+template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) {
+  return __float2bfloat16(__bfloat162float(x) / n);
+}
+#endif
+
+__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; }
+
+template <typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)>
+__global__ void InitDataReduceKernel(T *data, const size_t N,
+                                     const size_t offset, const int rep,
+                                     const int nranks) {
+  for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N;
+       o += gridDim.x * blockDim.x) {
+    T val = testValue<T>(o + offset, rep, 0);
+    val = PreOp(val, preMulScalar(0));
+    for (int i = 1; i < nranks; i++) {
+      T val1 = testValue<T>(o + offset, rep, i);
+      val1 = PreOp(val1, preMulScalar(i));
+      val = Op(val, val1);
+    }
+    data[o] = PostOp(val, nranks);
+  }
+}
+
+#define KERN(type, op, preop, postop)                                         \
+  (void *)InitDataReduceKernel<type, op<type>, preop<type>, postop<type>>
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+#define OPS(type)                                                             \
+  KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent),                        \
+      KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent),                   \
+      KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv),              \
+      KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent)
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+#define OPS(type)                                                             \
+  KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent),                        \
+      KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent),                   \
+      KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv)
+#else
+#define OPS(type)                                                             \
+  KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent),                        \
+      KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent),                   \
+      KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent),                    \
+      KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent)
+#endif
+
+static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = {
+    OPS(int8_t),   OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t),
+    OPS(uint64_t), OPS(half),    OPS(float),   OPS(double),
+#if defined(__CUDA_BF16_TYPES_EXIST__) && \
+    NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+    OPS(__nv_bfloat16)
+#endif
+};
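The redInitDataKerns table above holds one kernel pointer per (type, op) pair, laid out type-major, so InitDataReduce below indexes it as [type * test_opNumMax + op]. A sketch of that lookup, assuming test_opNumMax (declared in common_inplace.h, not shown in this diff) matches the six entries of the 2.11+ OPS() list:

#include <cstdio>

int main() {
  const int test_opNumMax = 6;  // assumed: sum/prod/max/min/avg/premulsum
  int type = 7;                 // ncclFloat in test_types[] ordering
  int op = 0;                   // ncclSum
  // Index of the float/sum InitDataReduceKernel instantiation.
  printf("kernel index = %d\n", type * test_opNumMax + op);  // 42
  return 0;
}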
+testResult_t InitDataReduce(void *data, const size_t count, const size_t offset,
+                            ncclDataType_t type, ncclRedOp_t op, const int rep,
+                            const int nranks) {
+  dim3 grid = {32, 1, 1};
+  dim3 block = {256, 1, 1};
+  void *args[5] = {(void *)&data, (void *)&count, (void *)&offset,
+                   (void *)&rep, (void *)&nranks};
+  CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid,
+                             block, args, 0, cudaStreamDefault));
+  return testSuccess;
+}
+
+template <typename T>
+__global__ void InitDataKernel(T *data, const size_t N, const int rep,
+                               const int rank) {
+  for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N;
+       o += gridDim.x * blockDim.x)
+    data[o] = testValue<T>(o, rep, rank);
+}
+
+static void *const initDataKerns[ncclNumTypes] = {
+    (void *)InitDataKernel<int8_t>,  (void *)InitDataKernel<uint8_t>,
+    (void *)InitDataKernel<int32_t>, (void *)InitDataKernel<uint32_t>,
+    (void *)InitDataKernel<int64_t>, (void *)InitDataKernel<uint64_t>,
+    (void *)InitDataKernel<half>,    (void *)InitDataKernel<float>,
+    (void *)InitDataKernel<double>,
+#if defined(__CUDA_BF16_TYPES_EXIST__) && \
+    NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+    (void *)InitDataKernel<__nv_bfloat16>
+#endif
+};
+
+template <typename T>
+testResult_t InitDataType(void *dest, const size_t N, const int rep,
+                          const int rank) {
+  T *ptr = (T *)dest;
+  InitDataKernel<<<16, 512>>>(ptr, N, rep, rank);
+  return testSuccess;
+}
+
+testResult_t InitData(void *data, const size_t count, ncclDataType_t type,
+                      const int rep, const int rank) {
+  dim3 grid = {32, 1, 1};
+  dim3 block = {256, 1, 1};
+  void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank};
+  CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault));
+  return testSuccess;
+}
+
+void Barrier(struct threadArgs *args) {
+  while (args->barrier[args->barrier_idx] != args->thread)
+    pthread_yield();
+  args->barrier[args->barrier_idx] = args->thread + 1;
+  if (args->thread + 1 == args->nThreads) {
+#ifdef MPI_SUPPORT
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+    args->barrier[args->barrier_idx] = 0;
+  } else {
+    while (args->barrier[args->barrier_idx])
+      pthread_yield();
+  }
+  args->barrier_idx = !args->barrier_idx;
+}
+
+// Inter-thread/process barrier+allreduce
+void Allreduce(struct threadArgs *args, double *value, int average) {
+  while (args->barrier[args->barrier_idx] != args->thread)
+    pthread_yield();
+  double val = *value;
+  if (args->thread > 0) {
+    double val2 = args->reduce[args->barrier_idx];
+    if (average == 1)
+      val += val2;
+    if (average == 2)
+      val = std::min(val, val2);
+    if (average == 3)
+      val = std::max(val, val2);
+  }
+  if (average || args->thread == 0)
+    args->reduce[args->barrier_idx] = val;
+  args->barrier[args->barrier_idx] = args->thread + 1;
+  if (args->thread + 1 == args->nThreads) {
+#ifdef MPI_SUPPORT
+    if (average != 0) {
+      MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX;
+      MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1,
+                    MPI_DOUBLE, op, MPI_COMM_WORLD);
+    }
+#endif
+    if (average == 1)
+      args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads;
+    args->reduce[1 - args->barrier_idx] = 0;
+    args->barrier[args->barrier_idx] = 0;
+  } else {
+    while (args->barrier[args->barrier_idx])
+      pthread_yield();
+  }
+  *value = args->reduce[args->barrier_idx];
+  args->barrier_idx = !args->barrier_idx;
+}
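Barrier() and Allreduce() above share a double-buffered arrival idiom: threads bump a slot in turn, the last arrival resets it to release everyone, and barrier_idx flips so back-to-back barriers do not collide. A minimal standalone sketch of the same idiom (toyBarrier and slot names are illustrative; sched_yield stands in for the glibc pthread_yield used in the source):

#include <sched.h>

volatile int slots[2] = {0, 0};

void toyBarrier(int thread, int nThreads, int *idx) {
  volatile int *slot = &slots[*idx];
  while (*slot != thread) sched_yield();  // wait for my turn to arrive
  *slot = thread + 1;                     // announce arrival
  if (thread + 1 == nThreads) *slot = 0;  // last arrival releases everyone
  else { while (*slot) sched_yield(); }   // wait for the release
  *idx = !*idx;                           // alternate slots between calls
}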
+testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
+  size_t count = args->expectedBytes/wordSize(type);
+  double maxDelta = 0.0;
+  for (int i=0; i<args->nGpus; i++) {
+    int device;
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
+    CUDACHECK(cudaSetDevice(device));
+    void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
+    TESTCHECK(CheckDelta(data, args->expected[i], count, type, args->deltaHost));
+    maxDelta = std::max(*(args->deltaHost), maxDelta);
+
+#ifdef DEBUG_PRINT
+    if (rank == 0) {
+      int *expectedHost = (int *)malloc(args->expectedBytes);
+      int *dataHost = (int *)malloc(args->expectedBytes);
+
+      cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost);
+      printf("\n Expected: ");
+      for (int j=0; j<args->expectedBytes/sizeof(int); j++) {
+        printf("%d:%d ", j, expectedHost[j]);
+      }
+      printf("\n");
+
+      cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
+      printf("\n Actual: ");
+      for (int j=0; j<args->expectedBytes/sizeof(int); j++) {
+        printf("%d:%d ", j, dataHost[j]);
+      }
+      printf("\n");
+      free(expectedHost);
+      free(dataHost);
+    }
+#endif
+  }
+  double nranks = args->nProcs*args->nThreads*args->nGpus;
+  if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
+  *delta = maxDelta;
+  return testSuccess;
+}
+
+testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams,
+                                   ncclComm_t *comms) {
+  cudaError_t cudaErr;
+  int remaining = ngpus;
+  int *done = (int *)malloc(sizeof(int) * ngpus);
+  memset(done, 0, sizeof(int) * ngpus);
+  while (remaining) {
+    int idle = 1;
+    for (int i = 0; i < ngpus; i++) {
+      if (done[i])
+        continue;
+
+      cudaErr = cudaStreamQuery(streams[i]);
+      if (cudaErr == cudaSuccess) {
+        done[i] = 1;
+        remaining--;
+        idle = 0;
+        continue;
+      }
+
+      if (cudaErr != cudaErrorNotReady)
+        CUDACHECK(cudaErr);
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0)
+      if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) {
+        ncclResult_t ncclAsyncErr;
+        NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr));
+        if (ncclAsyncErr != ncclSuccess) {
+          // An asynchronous error happened. Stop the operation and destroy
+          // the communicator
+          for (int i = 0; i < ngpus; i++)
+            NCCLCHECK(ncclCommAbort(comms[i]));
+          // Abort the perf test
+          NCCLCHECK(ncclAsyncErr);
+        }
+      }
+#endif
+    }
+
+    // We might want to let other threads (including NCCL threads) use the CPU.
+    if (idle)
+      pthread_yield();
+  }
+  free(done);
+  return testSuccess;
+}
+testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type,
+                         ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) {
+  size_t count = args->nbytes / wordSize(type);
+  if (args->nGpus != 1) {
+    OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs");
+    return testInternalError;
+  }
+  // Try to change offset for each iteration so that we avoid cache effects and
+  // catch race conditions in ptrExchange
+  // size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
+  // size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+  // size_t shift = totalnbytes * (iter % steps);
+
+  for (int i = 0; i < args->nGpus; i++) {
+    ncclComm_t comm = args->comms[miter * nGpus + i];
+    int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i);
+    ncclRedOp_t op;
+
+    if (opIndex < ncclNumOps) {
+      op = opIndex;
+    }
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    else {
+      union {
+        int8_t i8;
+        uint8_t u8;
+        int32_t i32;
+        uint32_t u32;
+        int64_t i64;
+        uint64_t u64;
+        half f16;
+        float f32;
+        double f64;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+        __nv_bfloat16 bf16;
+#endif
+      };
+      int scalar = preMulScalar(rank);
+      switch (type) {
+      case ncclInt8:
+        i8 = int8_t(scalar);
+        break;
+      case ncclUint8:
+        u8 = uint8_t(scalar);
+        break;
+      case ncclInt32:
+        i32 = int32_t(scalar);
+        break;
+      case ncclUint32:
+        u32 = uint32_t(scalar);
+        break;
+      case ncclInt64:
+        i64 = int64_t(scalar);
+        break;
+      case ncclUint64:
+        u64 = uint64_t(scalar);
+        break;
+      case ncclFloat16:
+        f16 = __float2half(float(scalar));
+        break;
+      case ncclFloat32:
+        f32 = float(scalar);
+        break;
+      case ncclFloat64:
+        f64 = double(scalar);
+        break;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+      case ncclBfloat16:
+        bf16 = __float2bfloat16(float(scalar));
+        break;
+#endif
+      }
+      NCCLCHECK(ncclRedOpCreatePreMulSum(
+          &op, &u64, type, ncclScalarHostImmediate, comm));
+    }
+#endif
+    TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx));
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    if (opIndex >= ncclNumOps) {
+      NCCLCHECK(ncclRedOpDestroy(op, comm));
+    }
+#endif
+  }
+
+  return testSuccess;
+}
+testResult_t startColl(struct threadArgs *args, ncclDataType_t type,
+                       ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) {
+  size_t count = args->nbytes / wordSize(type);
+
+  // Try to change offset for each iteration so that we avoid cache effects and
+  // catch race conditions in ptrExchange
+  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
+  size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+  size_t shift = totalnbytes * (iter % steps);
+
+  if (args->nGpus > 1) {
+    // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart");
+    NCCLCHECK(ncclGroupStart());
+  }
+  for (int i = 0; i < args->nGpus; i++) {
+    ncclComm_t comm = args->comms[miter * nGpus + i];
+    // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm);
+#ifndef NCCL_MAJOR
+    int cudaDev;
+    NCCLCHECK(ncclCommCuDevice(comm, &cudaDev));
+    CUDACHECK(cudaSetDevice(cudaDev));
+#endif
+    int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i);
+    char *recvBuff = ((char *)args->recvbuffs[i]) + shift;
+    char *sendBuff = ((char *)args->sendbuffs[i]) + shift;
+    ncclRedOp_t op;
+
+    if (opIndex < ncclNumOps) {
+      op = opIndex;
+    }
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    else {
+      union {
+        int8_t i8;
+        uint8_t u8;
+        int32_t i32;
+        uint32_t u32;
+        int64_t i64;
+        uint64_t u64;
+        half f16;
+        float f32;
+        double f64;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+        __nv_bfloat16 bf16;
+#endif
+      };
+      int scalar = preMulScalar(rank);
+      switch (type) {
+      case ncclInt8:
+        i8 = int8_t(scalar);
+        break;
+      case ncclUint8:
+        u8 = uint8_t(scalar);
+        break;
+      case ncclInt32:
+        i32 = int32_t(scalar);
+        break;
+      case ncclUint32:
+        u32 = uint32_t(scalar);
+        break;
+      case ncclInt64:
+        i64 = int64_t(scalar);
+        break;
+      case ncclUint64:
+        u64 = uint64_t(scalar);
+        break;
+      case ncclFloat16:
+        f16 = __float2half(float(scalar));
+        break;
+      case ncclFloat32:
+        f32 = float(scalar);
+        break;
+      case ncclFloat64:
+        f64 = double(scalar);
+        break;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+      case ncclBfloat16:
+        bf16 = __float2bfloat16(float(scalar));
+        break;
+#endif
+      }
+      NCCLCHECK(ncclRedOpCreatePreMulSum(
+          &op, &u64, type, ncclScalarHostImmediate, comm));
+    }
+#endif
+    // miter is the collId.
+    TESTCHECK(args->collTest->runColl(
+        (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank
+                          : sendBuff),
+        (void *)(in_place ? recvBuff + args->recvInplaceOffset * rank
+                          : recvBuff), miter, cbArgList + miter, rankCtx));
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+    if (opIndex >= ncclNumOps) {
+      NCCLCHECK(ncclRedOpDestroy(op, comm));
+    }
+#endif
+  }
+  if (args->nGpus > 1) {
+    // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd");
+    NCCLCHECK(ncclGroupEnd());
+  }
+
+  if (blocking_coll) {
+    // Complete op before returning
+    TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
+  }
+  if (blocking_coll)
+    Barrier(args);
+  return testSuccess;
+}
+
+testResult_t completeColl(struct threadArgs *args) {
+  if (blocking_coll)
+    return testSuccess;
+
+  int gotCqeCnt = 0;
+  while (gotCqeCnt < multi_iters) {
+    for (int i = 0; i < multi_iters; i++) {
+      pthread_mutex_lock(&cbArgList[i].mutex);
+      if (cbArgList[i].gotCqe == 1) {
+        if (seenCqe[i] == 0) {
+          gotCqeCnt++;
+          seenCqe[i] = 1;
+
+          // int cudaDev;
+          // CUDACHECK(cudaGetDevice(&cudaDev));
+          // if (cudaDev == 0) {
+          //   OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i);
+          // }
+
+        }
+      }
+      pthread_mutex_unlock(&cbArgList[i].mutex);
+    }
+  }
+  return testSuccess;
+}
+
+testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) {
+
+  size_t count = args->nbytes / wordSize(type);
+
+  Barrier(args);
+
+  // Performance Benchmark
+  auto start = std::chrono::high_resolution_clock::now();
+  for (int iter = 0; iter < iters; iter++) {
+
+    for (int miter = 0; miter < multi_iters; miter++) {
+      seenCqe[miter] = 0;
+      TESTCHECK(startColl(args, type, op, root, in_place,
+                          iter * multi_iters + miter, miter, rankCtx));
+    }
+
+    TESTCHECK(completeColl(args));
+
+    int cudaDev;
+    cudaGetDevice(&cudaDev);
+    OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
+  }
+
+  auto delta = std::chrono::high_resolution_clock::now() - start;
+  double deltaSec =
+      std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+  deltaSec = deltaSec / (iters * agg_iters * multi_iters);
+  if (cudaGraphLaunches >= 1)
+    deltaSec = deltaSec / cudaGraphLaunches;
+  Allreduce(args, &deltaSec, average);
+
+  double algBw, busBw;
+  args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw,
+                        args->nProcs * args->nThreads * args->nGpus);
+
+  Barrier(args);
+
+  ofcclDestroy(rankCtx);
+
+  double maxDelta = 0;
+  // static __thread int rep = 0; // parameter for re-initializing the buffers; no longer used.
+  // rep++;
+  if (datacheck) {
+
+    TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
+    // aggregate delta from all threads and procs
+    Allreduce(args, &maxDelta, 3);
+  }
+
+  double timeUsec = deltaSec * 1.0E6;
+  char timeStr[100];
+  if (timeUsec >= 10000.0) {
+    sprintf(timeStr, "%7.0f", timeUsec);
+  } else if (timeUsec >= 100.0) {
+    sprintf(timeStr, "%7.1f", timeUsec);
+  } else {
+    sprintf(timeStr, "%7.2f", timeUsec);
+  }
+  if (datacheck) {
+    PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
+  } else {
+    PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
+  }
+
+  args->bw[0] += busBw;
+  args->bw_count[0]++;
+  return testSuccess;
+}
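completeColl() above polls cbArgList[i].gotCqe under a mutex until every one of the multi_iters collectives has reported a completion-queue entry. The callback that sets that flag is registered by the per-test runColl() and is not part of this diff; a plausible sketch of the producer side, with the CallBackArgs fields inferred from how completeColl() reads them (the real struct lives in common_inplace.h):

#include <pthread.h>

// Hypothetical shape of CallBackArgs, inferred from completeColl().
typedef struct {
  int collId;
  int gotCqe;
  pthread_mutex_t mutex;
} CallBackArgs;

// Hypothetical completion callback invoked when a CQE arrives for collId.
static int commonCallback(int collId, void *arg) {
  CallBackArgs *cb = (CallBackArgs *)arg;
  pthread_mutex_lock(&cb->mutex);
  cb->gotCqe = 1;  // completeColl() observes this flag and counts it once
  pthread_mutex_unlock(&cb->mutex);
  return 0;
}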
+void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) {
+  int nranks = args->nProcs * args->nGpus * args->nThreads;
+  size_t count, sendCount, recvCount, paramCount, sendInplaceOffset,
+      recvInplaceOffset;
+
+  count = size / wordSize(type);
+  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount,
+                                   &sendInplaceOffset, &recvInplaceOffset,
+                                   (size_t)count, (size_t)nranks);
+
+  args->nbytes = paramCount * wordSize(type);
+  args->sendBytes = sendCount * wordSize(type);
+  args->expectedBytes = recvCount * wordSize(type);
+  args->sendInplaceOffset = sendInplaceOffset * wordSize(type);
+  args->recvInplaceOffset = recvInplaceOffset * wordSize(type);
+}
+
+testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type,
+                      const char *typeName, ncclRedOp_t op, const char *opName,
+                      int root, bool is_ofccl) {
+  // if (is_ofccl) {
+  // First create the ofcclRankCtx_t.
+  int thrdCudaDev;
+  CUDACHECK(cudaGetDevice(&thrdCudaDev));
+  ofcclRankCtx_t rankCtx;
+  ofcclInitRankCtx(&rankCtx, thrdCudaDev);
+
+  // Prepare for all sizes; op and type are traversed in the caller.
+  // TODO: if we support multiple sizes, each size should use a separate ncclComm
+  for (size_t size = args->minbytes; size <= args->maxbytes;
+       size = ((args->stepfactor > 1) ? size * args->stepfactor
+                                      : size + args->stepbytes)) {
+    setupArgs(size, type, args);
+    for (int miter = 0; miter < multi_iters; miter++) {
+      TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx));
+    }
+  }
+
+  // Finish preparing the data for the correctness check here.
+  static __thread int rep = 0;
+  rep++;
+  if (datacheck) { // Run the data-init kernels before the daemonKernel is launched.
+    // Initialize sendbuffs, recvbuffs and expected
+    TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0));
+
+    // int cudaDev;
+    // CUDACHECK(cudaGetDevice(&cudaDev));
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), cudaDev);
+  }
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a little and saves the kernel-launch time. Works together with ofccl not quitting aggressively on its own.
+  // ofcclFinalizeRankCtx7StartHostThrds(rankCtx);
+  // }
+
+  // TODO: if we support multiple sizes, we could warm up for every size; or keep the current way, but then we must make sure the right comm is picked.
+  // Warmup still needs to be enabled, otherwise ofccl performance collapses.
+  setupArgs(args->maxbytes, type, args);
+  for (int iter = 0; iter < warmup_iters; iter++) {
+    for (int miter = 0; miter < multi_iters; miter++) {
+      seenCqe[miter] = 0;
+      TESTCHECK(startColl(args, type, op, root, 0,
+                          iter * multi_iters + miter, miter, rankCtx));
+    }
+    TESTCHECK(completeColl(args));
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), cudaDev, iter, multi_iters);
+  }
+
+  // Benchmark
+  // for (size_t size = args->minbytes; size <= args->maxbytes;
+  //      size = ((args->stepfactor > 1) ? size * args->stepfactor
+  //                                     : size + args->stepbytes)) {
+  //   setupArgs(size, type, args);
+  print_line_header(max(args->sendBytes, args->expectedBytes),
+                    args->nbytes / wordSize(type), typeName, opName, root);
+  // TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx));
+  TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // Because we moved ofcclDestroy into BenchTime, we cannot call BenchTime twice here to run out-of-place first and then in-place. To support that, add a loop inside BenchTime.
+  PRINT("\n");
+  // }
+
+  // if (is_ofccl) {
+  //   OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self());
+  //   ofcclDestroy(rankCtx); // Moved into BenchTime so that the check can run.
+  // }
+
+  return testSuccess;
+}
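TimeTest() above restructures the benchmark into prepare-once, run-many: all collectives are registered before the daemon kernel starts, and only issue/poll happens in the timed loop. A condensed sketch of that call order, using the helpers defined in this file (it compiles only inside this test source; loop structure is illustrative):

testResult_t ofcclFlowSketch(struct threadArgs *args, ncclDataType_t type,
                             ncclRedOp_t op, int root) {
  int dev;
  CUDACHECK(cudaGetDevice(&dev));

  ofcclRankCtx_t rankCtx;
  ofcclInitRankCtx(&rankCtx, dev);                   // 1. per-rank context

  for (int miter = 0; miter < multi_iters; miter++)  // 2. register every coll once
    TESTCHECK(prepareColl(args, type, op, root, 0, miter, miter, rankCtx));
  ofcclPrepareDone(rankCtx);                         // 3. launch the daemon kernel

  for (int iter = 0; iter < iters; iter++) {         // 4. issue + poll repeatedly
    for (int miter = 0; miter < multi_iters; miter++) {
      seenCqe[miter] = 0;
      TESTCHECK(startColl(args, type, op, root, 0, iter * multi_iters + miter,
                          miter, rankCtx));
    }
    TESTCHECK(completeColl(args));
  }

  ofcclDestroy(rankCtx);                             // 5. tear down (done in BenchTime)
  return testSuccess;
}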
+testResult_t threadRunTests(struct threadArgs *args) {
+  // OFTEST_LOG1(TEST, "Enter threadRunTests");
+  // Set device to the first of our GPUs. If we don't do that, some operations
+  // will be done on the current GPU (by default : 0) and if the GPUs are in
+  // exclusive mode those operations will fail.
+  int gpuid = args->localRank * args->nThreads * args->nGpus +
+              args->thread * args->nGpus;
+  CUDACHECK(cudaSetDevice(gpuid));
+  TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype,
+                                   test_typenames[nccltype],
+                                   (ncclRedOp_t)ncclop, test_opnames[ncclop]));
+  return testSuccess;
+}
+
+testResult_t threadInit(struct threadArgs *args) {
+  // OFTEST_LOG1(TEST, "Enter threadInit");
+  char hostname[1024];
+  getHostName(hostname, 1024);
+  int nranks = args->nProcs * args->nThreads * args->nGpus;
+
+  // set main thread again
+  is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0;
+
+  NCCLCHECK(ncclGroupStart());
+  for (int i = 0; i < args->nGpus; i++) {
+    int rank = args->proc * args->nThreads * args->nGpus +
+               args->thread * args->nGpus + i;
+    int gpuid = args->localRank * args->nThreads * args->nGpus +
+                args->thread * args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    // OFTEST_LOG1(TEST, "CommInitRank here");
+    NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank));
+  }
+  NCCLCHECK(ncclGroupEnd());
+
+  TESTCHECK(threadRunTests(args));
+
+  for (int i = 0; i < args->nGpus; i++) {
+    NCCLCHECK(ncclCommDestroy(args->comms[i]));
+  }
+  return testSuccess;
+}
+
+void *threadLauncher(void *thread_) {
+  struct testThread *thread = (struct testThread *)thread_;
+  thread->ret = thread->func(&thread->args);
+  return NULL;
+}
+testResult_t threadLaunch(struct testThread *thread) {
+  pthread_create(&thread->thread, NULL, threadLauncher, thread);
+  return testSuccess;
+}
+
+testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff,
+                           size_t recvBytes, void **expected, size_t nbytes,
+                           int nranks) {
+  CUDACHECK(cudaMalloc(sendbuff, nbytes));
+  // CUDACHECK(cudaMalloc(recvbuff, nbytes));
+  if (datacheck)
+    CUDACHECK(cudaMalloc(expected, recvBytes));
+  return testSuccess;
+}
+
+testResult_t run(); // Main function
+
+int main(int argc, char *argv[]) {
+  // Make sure every line is flushed so that we see the progress of the test
+  setlinebuf(stdout);
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0)
+  ncclGetVersion(&test_ncclVersion);
+#else
+  test_ncclVersion = NCCL_VERSION_CODE;
+#endif
+  // printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE,
+  //        test_ncclVersion);
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0)
+  test_opnum = 4;
+  test_typenum = 9;
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) &&
+      test_ncclVersion >= NCCL_VERSION(2, 10, 0)) {
+    test_opnum++; // ncclAvg
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    test_typenum++; // bfloat16
+#endif
+  }
+  if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) &&
+      test_ncclVersion >= NCCL_VERSION(2, 11, 0)) {
+    test_opnum++; // PreMulSum
+  }
+#endif
+
+  // Parse args
+  double parsed;
+  int longindex;
+  static struct option longopts[] = {
+      {"nthreads", required_argument, 0, 't'},
+      {"ngpus", required_argument, 0, 'g'},
+      {"minbytes", required_argument, 0, 'b'},
+      {"maxbytes", required_argument, 0, 'e'},
+      {"stepbytes", required_argument, 0, 'i'},
+      {"stepfactor", required_argument, 0, 'f'},
+      {"iters", required_argument, 0, 'n'},
+      {"agg_iters", required_argument, 0, 'm'},
+      {"multi_iters", required_argument, 0, 'M'},
+      {"warmup_iters", required_argument, 0, 'w'},
+      {"parallel_init", required_argument, 0, 'p'},
+      {"check", required_argument, 0, 'c'},
+      {"op", required_argument, 0, 'o'},
+      {"datatype", required_argument, 0, 'd'},
+      {"root", required_argument, 0, 'r'},
+      {"blocking", required_argument, 0, 'z'},
+      {"cudagraph", required_argument, 0, 'G'},
+      {"average", required_argument, 0, 'a'},
+      {"help", no_argument, 0, 'h'},
+      {}};
+
+  while (1) {
+    int c;
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts,
+                    &longindex);
+
+    if (c == -1)
+      break;
+
+    switch (c) {
+    case 't':
+      nThreads = strtol(optarg, NULL, 0);
+      break;
+    case 'g':
+      nGpus = strtol(optarg, NULL, 0);
+      break;
+    case 'b':
+      parsed = parsesize(optarg);
+      if (parsed < 0) {
+        fprintf(stderr, "invalid size specified for 'minbytes'\n");
+        return -1;
+      }
+      minBytes = (size_t)parsed;
+      break;
+    case 'e':
+      parsed = parsesize(optarg);
+      if (parsed < 0) {
+        fprintf(stderr, "invalid size specified for 'maxbytes'\n");
+        return -1;
+      }
+      maxBytes = (size_t)parsed;
+      break;
+    case 'i':
+      stepBytes = strtol(optarg, NULL, 0);
+      break;
+    case 'f':
+      stepFactor = strtol(optarg, NULL, 0);
+      break;
+    case 'n':
+      iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'M':
+      multi_iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'm':
+#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
+      agg_iters = (int)strtol(optarg, NULL, 0);
+#else
+      fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n");
+#endif
+      break;
+    case 'w':
+      warmup_iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'c':
+      datacheck = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'p':
+      parallel_init = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'o':
+      ncclop = ncclstringtoop(optarg);
+      break;
+    case 'd':
+      nccltype = ncclstringtotype(optarg);
+      break;
+    case 'r':
+      ncclroot = strtol(optarg, NULL, 0);
+      break;
+    case 'z':
+      blocking_coll = strtol(optarg, NULL, 0);
+      break;
+    case 'G':
+#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \
+    CUDART_VERSION >= 11030
+      cudaGraphLaunches = strtol(optarg, NULL, 0);
+#else
+      printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA "
+             "11.3. Ignoring\n");
+#endif
+      break;
Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-M,--multi_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), + MPI_BYTE, MPI_COMM_WORLD); + for (int p = 0; p < nProcs; p++) { + if (p == proc) + break; + if (hostHashs[p] == hostHashs[proc]) + localRank++; + } +#endif + is_main_thread = (proc == 0) ? 1 : 0; + + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? "factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? 
(char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + // size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + // if (maxBytes > memMaxBytes) { + // maxBytes = memMaxBytes; + // if (proc == 0) + // printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + // maxBytes); + // } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads]; + void *recvbuffs[nGpus * nThreads]; + void *expected[nGpus * nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + (size_t)nProcs * nGpus * nThreads); + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + expected + i, (size_t)maxBytes, + nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when using multiple sizes. + ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use a separate comm per miter + // TODO: we do not support MPI for now.
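+ // Note on the layout: ncclCommInitAll below fills comms as [miter][thread][gpu]; + // the memcpy then regroups them into adjusted_comms as [thread][miter][gpu], so + // each thread later gets its multi_iters communicators as one contiguous slice + // (handed out as threads[t].args.comms).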
+ for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: multiple sizes are not supported yet. + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only a single size is supported for now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + threads[t].args.sendbuffs = sendbuffs + t * nGpus; + threads[t].args.recvbuffs = sendbuffs + t * nGpus; + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ?
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + if (sendbuffs[i]) + CUDACHECK(cudaFree((char *)sendbuffs[i])); + // if (recvbuffs[i]) + // CUDACHECK(cudaFree((char *)recvbuffs[i])); + if (datacheck) + CUDACHECK(cudaFree(expected[i])); + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_inplace/common_inplace.h b/src_inplace/common_inplace.h new file mode 100644 index 0000000..406f634 --- /dev/null +++ b/src_inplace/common_inplace.h @@ -0,0 +1,289 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include <pthread.h> +#include "nccl1_compat.h" + +// #define DEBUG_PRINT 1 + +#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" ..
%s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include <chrono> + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include <unistd.h> + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include <stdint.h> + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c <
n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of: + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + fclose(file); + } + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t<test_typenum; t++) { + if (strcmp(str, test_typenames[t]) == 0) { + return t; + } + } + if (strcmp(str, "all") == 0) { + return -1; + } + printf("invalid type %s, defaulting to %s\n", str, test_typenames[ncclFloat]); + return ncclFloat; +} + +static int ncclstringtoop(char *str) { + for (int o=0; o<test_opnum; o++) { + if (strcmp(str, test_opnames[o]) == 0) { + return o; + } + } + if (strcmp(str, "all") == 0) { + return -1; + } + printf("invalid op %s, defaulting to %s\n", str, test_opnames[ncclSum]); + return ncclSum; +} + +#endif diff --git a/src_inplace/nccl1_compat.h b/src_inplace/nccl1_compat.h new file mode 100644 --- /dev/null +++ b/src_inplace/nccl1_compat.h +#ifndef NCCL1_COMPAT_H +#define NCCL1_COMPAT_H +#ifndef NCCL_MAJOR // NCCL 1.x +#define NCCL_MAJOR 1 +#define NCCL_MINOR 0 + +#define ncclNumOps nccl_NUM_OPS +#define ncclNumTypes nccl_NUM_TYPES + +static ncclResult_t ncclGroupStart() { return ncclSuccess; } +static ncclResult_t ncclGroupEnd() { return ncclSuccess; } + +#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_inplace/ofccl_all_reduce_inp.cu b/src_inplace/ofccl_all_reduce_inp.cu new file mode 100644 index 0000000..9123391 --- /dev/null +++ b/src_inplace/ofccl_all_reduce_inp.cu @@ -0,0 +1,159 @@
+/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_inplace.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + for (int i=0; i<args->nGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // Removed the logging here; otherwise it hurts performance. + // if (collId != collIdFromCqe) { + // // more robust error handling.
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl, + AllReducePrepare +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i<type_count; i++) { + for (int j=0; j<op_count; j++) { + TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], root)); + } + } + return testSuccess; +} diff --git a/src_manual_size/Makefile b/src_manual_size/Makefile new file mode 100644 +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time.
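+# For example (hypothetical values; substitute your own GPU's arch): +# NVCC_GENCODE="-gencode=arch=compute_86,code=sm_86" make +# or rely on the CARDNAME switch below, e.g.: CARDNAME=2080 make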
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG_NT), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info NVCUFLAGS $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := ofccl_all_reduce_ms +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_ms.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_ms.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu new file mode 100644 index 0000000..2b8146c --- /dev/null +++ b/src_manual_size/common_ms.cu @@ -0,0 +1,1578 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_ms.h" +#include "cuda.h" +#include "nccl.h" +#include +#include +#include +#include +#include + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#ifdef FULL_MS + size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; + #ifndef IN_ORDER + int idxList[8][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 
57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + } + }; + #else + int 
idxList[8][MULTI_ITERS] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + } + }; + #endif +#else + // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512}; + // size_t idxList[8][MULTI_ITERS] = { + // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15}, + // {4, 5, 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 4, 5, 6, 9, 10, 11, 7, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 4, 5, 6, 9, 10, 11, 7, 12, 13, 14, 15}, + // {4, 2, 3, 6, 7, 8, 5, 0, 1, 9, 10, 11, 12, 13, 14, 15}, + // {4, 2, 3, 1, 9, 10, 11, 6, 7, 8, 5, 0, 12, 13, 14, 15}, + // {4, 2, 3, 1, 9, 5, 0, 12, 13, 14, 10, 11, 6, 7, 8, 15} + // // {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0} + // }; + + // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 1048576}; + // size_t idxList[8][MULTI_ITERS] = { + // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + // {0, 2, 1, 3, 5, 4, 6, 9, 8, 7}, + // {3, 2, 5, 6, 4, 7, 1, 9, 8, 0}, + // {1, 2, 4, 5, 7, 6, 8, 9, 3, 0}, + // {2, 0, 5, 7, 4, 8, 9, 6, 3, 1}, + // {3, 4, 8, 2, 1, 0, 5, 7, 9, 6}, + // {1, 3, 9, 2, 4, 7, 8, 0, 5, 6}, + // {2, 6, 8, 1, 3, 0, 4, 5, 7, 9} + // }; + size_t countList[MULTI_ITERS] = {256, 147456}; + size_t idxList[8][MULTI_ITERS] = { + {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1} + + {1, 0}, + {1, 0}, + {0, 1}, + {1, 0}, + {0, 1}, + {1, 0}, + {0, 1} + }; 
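+ // Each row of idxList is one rank's issue order over the MULTI_ITERS collectives; + // rows that differ across ranks make the ranks invoke the collectives out of order + // relative to each other (BenchTime consumes this as idxList[cudaDev][miter_idx]).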
+#endif + +size_t sendBytesList[MULTI_ITERS]; +size_t recvBytesList[MULTI_ITERS]; + +#if NCCL_MAJOR >= 2 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; +#else +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static size_t stepBytes = 1 * 1024 * 1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int multi_iters = MULTI_ITERS; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; + break; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch (type) { + case ncclHalf: + return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + return 1e-2; +#endif + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + // case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: + return 1e-200; + } + return 1e-200; +} + +template <typename T> __device__ double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template <typename T> __device__ float toFloat(T a) { return
(float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template <typename T, int BSIZE> +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]); + return testSuccess; +} + +// For integer values, we use values between 0 and 255 (which can be absolutely compared) +template <typename T> +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue<int>(offset, rep, rank)); +} +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + // IF_CHECK: to verify correctness, comment out the first return and expose the second one. + return 1.0 / (1.0 + (float)testValue<int>(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue<float>(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue<float>(offset, rep, rank)); +} +#endif + +// Operations +template <typename T> __device__ T ncclOpSum(T a, T b) { return a + b; } +template <typename T> __device__ T ncclOpProd(T a, T b) { return a * b; } +template <typename T> __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template <typename T> __device__ T ncclOpMin(T a, T b) { return a < b ?
a : b; } + +// Definitions for half +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? a : b; +} + +template <typename T> __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template <typename T> __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); +} +template <typename T> __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); +} +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } + +template <typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)> +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue<T>(o + offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i = 1; i < nranks; i++) { + T val1 = testValue<T>(o + offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel<type, op<type>, preop<type>, postop<type>> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count,
(void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template <typename T> +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) + data[o] = testValue<T>(o, rep, rank); +} + +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel<int8_t>, (void *)InitDataKernel<uint8_t>, + (void *)InitDataKernel<int32_t>, (void *)InitDataKernel<uint32_t>, + (void *)InitDataKernel<int64_t>, (void *)InitDataKernel<uint64_t>, + (void *)InitDataKernel<half>, (void *)InitDataKernel<float>, + (void *)InitDataKernel<double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> +#endif +}; + +template <typename T> +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; + InitDataKernel<T><<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + args->barrier_idx = !args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); + } + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx = !args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; i<args->nGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ?
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; j<args->expectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; j<args->expectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); + while (remaining) { + int idle = 1; + for (int i = 0; i < ngpus; i++) { + if (done[i]) + continue; + + cudaErr = cudaStreamQuery(streams[i]); + if (cudaErr == cudaSuccess) { + done[i] = 1; + remaining--; + idle = 0; + continue; + } + + if (cudaErr != cudaErrorNotReady) + CUDACHECK(cudaErr); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } +#endif + } + + // We might want to let other threads (including NCCL threads) use the CPU. + if (idle) + pthread_yield(); + } + free(done); + return testSuccess; +} + +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ?
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int64_t(scalar); + break; + case ncclUint64: + u64 = uint64_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ?
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + // char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + // char *sendBuff = ((char *)args->sendbuffs[i]) + shift; + char *recvBuff = (char *)(args->recvbuffs[miter]); + char *sendBuff = (char *)(args->sendbuffs[miter]); + + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, RUN sendbuff @ %p, recvbuff @ %p", cudaDev, miter, sendBuff, recvBuff); + + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int64_t(scalar); + break; + case ncclUint64: + u64 = uint64_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + // miter is used as the collId. + TESTCHECK(args->collTest->runColl( + (void *)(sendBuff), + (void *)(recvBuff), miter, cbArgList + miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) + Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs *args, int iter=0) { + if (blocking_coll) + return testSuccess; + + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + pthread_mutex_lock(&cbArgList[i].mutex); + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get %dth cqe for coll_id = %d", pthread_self(), cudaDev, iter, i); + + } + } + pthread_mutex_unlock(&cbArgList[i].mutex); + } + } + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + + int cudaDev; + cudaGetDevice(&cudaDev); + + size_t count = args->nbytes / wordSize(type); + + 
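+ // Added commentary (not part of the original change): each timed iteration below is a submit/poll handshake. startColl() passes cbArgList + miter down through runColl() to ofcclRunAllReduce together with myCallback; the ofccl poller thread later sets gotCqe = 1 under that slot's mutex, and completeColl() spins until every coll_id of the round has been seen exactly once. In outline: + // for (int iter = 1; iter <= iters; iter++) { + // for each miter: seenCqe[miter] = 0; startColl(...); // async submit, one comm per coll_id + // completeColl(args, iter); // returns once all multi_iters CQEs have arrived + // }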
Barrier(args); + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 1; iter <= iters; iter++) { + // Permute the traversal order of miter here, so the collectives are invoked out of order. + for (int miter_idx = 0; miter_idx < multi_iters; miter_idx++) { // for (int miter = 0; miter < multi_iters; miter++) { + int miter = idxList[cudaDev][miter_idx]; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke %dth startColl iter for coll_id = %d", pthread_self(), cudaDev, iter, miter); + seenCqe[miter] = 0; + usleep(200); + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter, rankCtx)); + } + + TESTCHECK(completeColl(args, iter)); + + usleep(100000); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + } + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast<std::chrono::duration<double>>(delta).count(); + deltaSec = deltaSec / (iters * agg_iters * multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, + args->nProcs * args->nThreads * args->nGpus); + + Barrier(args); + + ofcclDestroy(rankCtx); + + double maxDelta = 0; + // static __thread int rep = 0; // was used to re-seed the buffer init; no longer needed. + // rep++; + if (datacheck) { + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec * 1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { + int nranks = args->nProcs * args->nGpus * args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, + recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, + &sendInplaceOffset, &recvInplaceOffset, + (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, + const char *typeName, ncclRedOp_t op, const char *opName, + int root, bool is_ofccl) { + // First create the ofcclRankCtx_t for this rank. + int thrdCudaDev; + CUDACHECK(cudaGetDevice(&thrdCudaDev)); + ofcclRankCtx_t rankCtx; + ofcclInitRankCtx(&rankCtx, thrdCudaDev); + + // Prepare for all sizes; op and type are traversed in the caller. 
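+ // Added commentary (sketch only; every name below is one this file already uses): the ofccl lifecycle TimeTest drives end to end is + // ofcclInitRankCtx(&rankCtx, thrdCudaDev); // once per rank + // prepareColl(...); // -> ofcclPrepareAllReduce(count, type, op, comm, collId, rankCtx), once per coll_id + // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // start the daemon kernel and host threads + // startColl(...); // -> ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx), repeatable + // completeColl(...); // wait for the callbacks' CQE flags + // ofcclDestroy(rankCtx); // called at the end of BenchTime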
+ // TODO: if we support multi size, each size should use a separate ncclComm + + for (int miter = 0; miter < multi_iters; miter++) { + args->nbytes = sendBytesList[miter]; + args->sendBytes = args->nbytes; + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx)); + } + + // Finish preparing the data needed for checking here. + static __thread int rep = 0; + rep++; + if (datacheck) { // Let the data-init kernels run before the daemonKernel is launched. + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev); + } + + // ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a little and saves the kernel-launch time. It must be paired with ofccl not exiting too eagerly on its side. + ofcclFinalizeRankCtx7StartHostThrds(rankCtx); + + // TODO: if we support multi size, we could warm up every size, or keep the current scheme but make sure the correct comm is picked. + // Warmup has to stay enabled, otherwise ofccl performance degrades badly. + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + args->nbytes = sendBytesList[miter]; + args->sendBytes = args->nbytes; + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), thrdCudaDev, iter, multi_iters); + } + + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // Since ofcclDestroy was moved into BenchTime, we cannot call BenchTime twice here (out-of-place first, then in-place). To get that back, add a loop inside BenchTime. + PRINT("\n"); + + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadRunTests"); + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, + test_typenames[nccltype], + (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadInit"); + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs * args->nThreads * args->nGpus; + + // set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i = 0; i < args->nGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread *thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t AllocateBuffLists(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes) { + CUDACHECK(cudaMalloc(sendbuff, sendBytes)); + CUDACHECK(cudaMalloc(recvbuff, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char *argv[]) { + // Make sure every line is flushed so that we see the progress of the test + setlinebuf(stdout); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); + + if (c == -1) + break; + + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if 
(parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'M': + // multi_iters is fixed at compile time (MULTI_ITERS in common_ms.h); the flag is accepted but ignored here. + // multi_iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \ + CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA " + "11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads <num threads>] \n\t" + "[-g,--ngpus <gpus per thread>] \n\t" + "[-b,--minbytes <min size in bytes>] \n\t" + "[-e,--maxbytes <max size in bytes>] \n\t" + "[-i,--stepbytes <increment size>] \n\t" + "[-f,--stepfactor <increment factor>] \n\t" + "[-n,--iters <iteration count>] \n\t" + "[-m,--agg_iters <aggregated iteration count>] \n\t" + "[-M,--multi_iters <multi iteration count>] \n\t" + "[-w,--warmup_iters <warmup iteration count>] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op <sum/prod/min/max/avg/all>] \n\t" +#else + "[-o,--op <sum/prod/min/max/all>] \n\t" +#endif + "[-d,--datatype <nccltype/all>] \n\t" + "[-r,--root <root>] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph <num graph launches>] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), + MPI_BYTE, MPI_COMM_WORLD); + for (int p = 0; p < nProcs; p++) { + if (p == proc) + break; + if (hostHashs[p] == hostHashs[proc]) + localRank++; + } +#endif + is_main_thread = (proc == 0) ? 1 : 0; + + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? "factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + if (multi_iters != MULTI_ITERS) { + OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d does not match MULTI_ITERS", pthread_self(), cudaDev, multi_iters); + } + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? (char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + if (maxBytes > memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) + printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads][MULTI_ITERS]; + void *recvbuffs[nGpus * nThreads][MULTI_ITERS]; + void *expected[nGpus * nThreads]; + // size_t sendBytes, recvBytes; + + // ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + // (size_t)nProcs * nGpus * nThreads); + + ncclTestEngine.getCollByteCountList(sendBytesList, recvBytesList, countList, multi_iters); + // for (int i = 0; i < MULTI_ITERS; i++) { + // OFTEST_LOG(TEST, "sendBytesList[%d] = %lu, recvBytesList[%d] = %lu", i, sendBytesList[i], i, recvBytesList[i]); + // } + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // These allocations are done once per thread. + // TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + // expected + i, (size_t)maxBytes, + // nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + + for (int j = 0; j < multi_iters; j++) { + AllocateBuffLists(&sendbuffs[i][j], sendBytesList[j], &recvbuffs[i][j], recvBytesList[j]); + + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, ALLOCATE sendbuff @ %p, recvbuff @ %p", i, j, sendbuffs[i][j], recvbuffs[i][j]); + } + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when using multiple sizes. 
+ ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use a separate comm per coll_id (miter) + // TODO: we do not support MPI now. + // comms is laid out as [miter][thread][gpu]; adjusted_comms regroups it as + // [thread][miter][gpu], so each thread below gets one contiguous slice. + for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: multiple sizes are not supported. + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + // threads[t].args.sendbuffs = sendbuffs[t]; + // threads[t].args.recvbuffs = recvbuffs[t]; + for (int j = 0; j < MULTI_ITERS; j++) { + threads[t].args.sendbuffs[j] = sendbuffs[t][j]; + threads[t].args.recvbuffs[j] = recvbuffs[t][j]; + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, DISPATCH SRC sendbuff @ %p, recvbuff @ %p", t, j, sendbuffs[t][j], recvbuffs[t][j]); + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, DISPATCH IN ARGS sendbuff @ %p, recvbuff @ %p", t, j, threads[t].args.sendbuffs[j], threads[t].args.recvbuffs[j]); + } + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, 
nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + for (int j = 0; j < MULTI_ITERS; j++) { + CUDACHECK(cudaFree((char *)sendbuffs[i][j])); + CUDACHECK(cudaFree((char *)recvbuffs[i][j])); + } + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h new file mode 100644 index 0000000..14f0ffb --- /dev/null +++ b/src_manual_size/common_ms.h @@ -0,0 +1,303 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#include <unistd.h> // usleep +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include <pthread.h> +#include "nccl1_compat.h" + +// #define DEBUG_PRINT 1 + +#define FULL_MS 1 + +#ifdef FULL_MS + #define MULTI_ITERS 161 +#else + // #define MULTI_ITERS 16 + #define MULTI_ITERS 2 +#endif + +// #define IN_ORDER 1 + +#define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + int cudaDev; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); + void (*getCollByteCountList)(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void* sendbuffs[MULTI_ITERS]; + size_t sendBytes; + size_t sendInplaceOffset; + void* recvbuffs[MULTI_ITERS]; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef 
testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include <chrono> + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include <unistd.h> + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include <stdint.h> + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // The fallback is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + fclose(file); + } + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu new file mode 100644 index 0000000..ccde169 --- /dev/null +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -0,0 +1,175 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_ms.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST, "Hi <%lu> Rank<%d>, sendcount = %p, recvcount = %p, paramcount = %p, sendInplaceOffset = %p, recvInplaceOffset = %p, count = %lu, nranks = %d", pthread_self(), cudaDev, sendcount, recvcount, paramcount, sendInplaceOffset, recvInplaceOffset, count, nranks); + + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +void AllReduceGetCollByteCountList(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen) { // listLen equals multi_iters + // OFTEST_LOG1(TEST, "hi"); + for (int i = 0; i < listLen; i++) { + *(sendCntList + i) = *(countList + i); + *(recvCntList + i) = *(countList + i); + } +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + for (int i=0; i<args->nGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // Logging removed here; keeping it would hurt performance. + // if (collId != collIdFromCqe) { + // // more robust error handle. 
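+ // (Added note, not part of the original change: myCallback runs on the ofccl poller thread, so it only takes the slot's mutex and sets gotCqe. A more defensive version could first compare collIdFromCqe against ((CallBackArgs *)args)->collId and report a mismatch, as the commented-out lines just below sketch.)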
// OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); // This is invoked later on the poller thread, so the device it reports would not be the right one. + + // int collId = ((CallBackArgs *)args)->collId; + // int cudaDev = ((CallBackArgs *)args)->cudaDev; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + args->collId = collId; + args->gotCqe = 0; + args->cudaDev = cudaDev; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl, + AllReducePrepare +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. 
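+# (Added usage note, not part of the original change.) The knobs this Makefile reads are CARDNAME, MPI/MPI_HOME, NCCL_HOME, DEBUG_NT and VERBOSE, all handled below. For example: +# make CARDNAME=3080 # default: -gencode=arch=compute_86,code=sm_86 +# make CARDNAME=2080 MPI=1 MPI_HOME=/path/to/mpi # sm_75 build with MPI support +# make DEBUG_NT=1 # -O0 -G -g, suitable for cuda-gdb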
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG_NT), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := all_reduce_nccl_ms +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_nccl_ms.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_nccl_ms.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_nccl_manual_size/all_reduce_nccl_ms.cu b/src_nccl_manual_size/all_reduce_nccl_ms.cu new file mode 100644 index 0000000..7bab5c2 --- /dev/null +++ b/src_nccl_manual_size/all_reduce_nccl_ms.cu @@ -0,0 +1,123 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_nccl_ms.h" + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +void AllReduceGetCollByteCountList(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen) { // listLen equals agg_iters + // OFTEST_LOG1(TEST, "hi"); + for (int i = 0; i < listLen; i++) { + *(sendCntList + i) = *(countList + i); + *(recvCntList + i) = *(countList + i); + } +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; i<args->nGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +// TODO: hacky global variables for now +// size_t countList[AGG_ITERS] = {4000, 8192000}; +size_t countList[AGG_ITERS] = {4000, 8192000}; +size_t sendBytesList[AGG_ITERS]; +size_t recvBytesList[AGG_ITERS]; +// ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; +ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; +int idxList[8][AGG_ITERS] = { + {0, 1}, + {1, 0} +}; + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = AGG_ITERS; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { + case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template<typename T> __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template<typename T> __device__ +float toFloat(T a) { + return (float)a; +} +template<> __device__ +float toFloat(half a) { + return __half2float(a); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> __device__ +float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template<typename T, int BSIZE> __global__ +void deltaKern(void* A_, void* B_, size_t count, double* max) { + const T* A = (const T*)A_; + const T* B = (const T*)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x*blockDim.x + threadIdx.x; + double locmax = 0.0; + for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) { + double delta = absDiff(A[i], B[i]); + if( delta > locmax ) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for(int stride = BSIZE/2; stride > 1; stride>>=1) { + __syncthreads(); + if( tid < stride ) + temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + } + __syncthreads(); + if( threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? 
temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]); + return testSuccess; +} + +template<typename T> +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep+rank+offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template<> +__device__ double testValue<double>(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(double)testValue<int>(offset, rep, rank)); +} +template<> +__device__ float testValue<float>(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(float)testValue<int>(offset, rep, rank)); +} +template<> +__device__ half testValue<half>(const size_t offset, const int rep, const int rank) { + return __float2half(testValue<float>(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { + return __float2bfloat16(testValue<float>(offset, rep, rank)); +} +#endif + +// Operations +template<typename T> +__device__ T ncclOpSum(T a, T b) { return a+b; } +template<typename T> +__device__ T ncclOpProd(T a, T b) { return a*b; } +template<typename T> +__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } +template<typename T> +__device__ T ncclOpMin(T a, T b) { return a<b ? a : b; } +template<> +__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } +template<> +__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } +template<> +__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } +template<> +__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } + +template<typename T> +__device__ T ncclPPOpIdent(T x, int arg) { return x; } +template<typename T> +__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } +template<typename T> +__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } +template<> +__device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x)*float(arg)); +} +template<> +__device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x)/n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x)*float(arg)); +} +template<> +__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x)/n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { + return 1 + rank%2; +} + +template<typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)> +__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=blockDim.x*gridDim.x) { + T val = testValue<T>(o+offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i=1; i<nranks; i++) { + T val1 = testValue<T>(o+offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel<type, op<type>, preop<type>, postop<type>> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template<typename T> +__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=blockDim.x*gridDim.x) + data[o] = testValue<T>(o, rep, rank); +} + +static void* const initDataKerns[ncclNumTypes] = { + (void*)InitDataKernel< int8_t>, + (void*)InitDataKernel< uint8_t>, + (void*)InitDataKernel< int32_t>, + (void*)InitDataKernel<uint32_t>, + (void*)InitDataKernel< int64_t>, + (void*)InitDataKernel<uint64_t>, + (void*)InitDataKernel< half>, + 
(void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char *recvBuff = (char *)(args->recvbuffs[iter]); + char *sendBuff = (char *)(args->sendbuffs[iter]); + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(sendBuff), + (void*)(recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + int cudaDev; + cudaGetDevice(&cudaDev); + + Barrier(args); + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); + // for (int aiter = 0; aiter < agg_iters; aiter++) { + for (int aiter_idx = 0; aiter_idx < agg_iters; aiter_idx++) { + int aiter = idxList[cudaDev][aiter_idx]; + args->nbytes = sendBytesList[aiter]; + args->sendBytes = args->nbytes; + TESTCHECK(startColl(args, typeList[aiter], op, root, in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + 
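// getBw above maps the averaged per-operation time onto the two reported
// bandwidths; for AllReduce, nccl-tests' documented model works out to:
//   algBw = (count * typesize) / 1e9 / deltaSec              // GB/s
//   busBw = algBw * 2 * (nranks - 1) / nranks                // ring hop factor
// e.g. 8 ranks at algBw = 10 GB/s report busBw = 17.5 GB/s.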
+ double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + + // Benchmark + args->nbytes = sendBytesList[0]; + args->sendBytes = args->nbytes; + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TODO: verify experimentally whether the in-place run should be restored. + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ?
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 
'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t AllocateBuffLists(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes) { + CUDACHECK(cudaMalloc(sendbuff, sendBytes)); + CUDACHECK(cudaMalloc(recvbuff, recvBytes)); + return testSuccess; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef 
MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads][AGG_ITERS]; + void* recvbuffs[nGpus*nThreads][AGG_ITERS]; + void* expected[nGpus*nThreads]; + // size_t sendBytes, recvBytes; + + // ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + ncclTestEngine.getCollByteCountList(sendBytesList, recvBytesList, countList, agg_iters); + + for (int i=0; i coll_id = %d, ALLOCATE sendbuff @ %p, recvbuff @ %p", i, j, sendbuffs[i][j], recvbuffs[i][j]); + } + } + + //if parallel init is not selected, use main thread to initialize NCCL + ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus*nThreads]; + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + // threads[t].args.sendbuffs = sendbuffs+t*nGpus; + // threads[t].args.recvbuffs = recvbuffs+t*nGpus; + for (int j = 0; j < AGG_ITERS; j++) { + threads[t].args.sendbuffs[j] = sendbuffs[t][j]; + threads[t].args.recvbuffs[j] = recvbuffs[t][j]; + } + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +#define AGG_ITERS 2 + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); + void (*getCollByteCountList)(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void* sendbuffs[AGG_ITERS]; + size_t sendBytes; + size_t sendInplaceOffset; + void* recvbuffs[AGG_ITERS]; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + 
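// The barrier[]/reduce[] pairs above are double-buffered: barrier_idx flips
// after every use, so two back-to-back Barrier()/Allreduce() calls never race
// on the same slot. syncRank and syncNranks (next) identify this rank within
// the sync[] group.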
int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t +#ifndef NCCL1_COMPAT_H +#define NCCL1_COMPAT_H + +#ifndef NCCL_MAJOR // NCCL 1.x +#define NCCL_MAJOR 1 +#define NCCL_MINOR 0 + +#define ncclNumOps nccl_NUM_OPS +#define ncclNumTypes nccl_NUM_TYPES + +static ncclResult_t ncclGroupStart() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } +static ncclResult_t ncclGroupEnd() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } + +#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_simple/Makefile b/src_simple/Makefile new file mode 100644 index 0000000..2206f40 --- /dev/null +++ b/src_simple/Makefile @@ -0,0 +1,109 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG_NT ?= 0 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. 
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG_NT), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter ofccl_reduce ofccl_broadcast +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_simple.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_simple.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu new file mode 100644 index 0000000..fc1d809 --- /dev/null +++ b/src_simple/common_simple.cu @@ -0,0 +1,1534 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_simple.h" +#include "cuda.h" +#include "nccl.h" +#include +#include +#include +#include +#include + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; +#else +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static size_t stepBytes = 1 * 1024 * 1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int multi_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + +// bool StringToInteger(const std::string& str, int64_t* value) { +// char* end; +// int64_t v = std::strtoll(str.data(), &end, 10); +// if (end == str.data()) { +// return false; +// } else { +// *value = v; +// return true; +// } +// } + +// static int64_t ParseIntegerFromEnv(const std::string& env_var, int64_t default_value) { +// const char* env_p = std::getenv(env_var.c_str()); +// if (env_p == nullptr) { return default_value; } +// int64_t value; +// if (StringToInteger(env_p, &value)) { +// return value; +// } else { +// return default_value; +// } +// } + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; + break; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + 
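// count == 1: no suffix was parsed, so a bare number is taken as bytes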
break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch (type) { + case ncclHalf: + return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + return 1e-2; +#endif + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + // case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: + return 1e-200; + } + return 1e-200; +} + +template <typename T> __device__ double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template <typename T> __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template <typename T, int BSIZE> +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<int8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<int32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<int64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]); + return testSuccess; +} + +template <typename T> +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs.
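// Since testValue is a pure function of (offset, rep, rank), the host can
// recompute any rank's input without communication -- a minimal sketch of the
// integer case (hostTestValue is our illustrative name, not part of the test):
static inline uint8_t hostTestValue(size_t offset, int rep, int rank) {
  return (uint8_t)((rep + rank + offset) % 256); // same formula as the kernel
}
// The expected reduction result at element o is then Op applied over
// hostTestValue(o, rep, r) for all ranks r, which is exactly what
// InitDataReduceKernel below evaluates on the GPU to fill the expected buffer.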
+template <> +__device__ double testValue<double>(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue<int>(offset, rep, rank)); +} +template <> +__device__ float testValue<float>(const size_t offset, const int rep, + const int rank) { + // IF_CHECK: to check correctness, comment out the first return and use the second one instead. + return 1.0 / (1.0 + (float)testValue<int>(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue<half>(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue<float>(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue<float>(offset, rep, rank)); +} +#endif + +// Operations +template <typename T> __device__ T ncclOpSum(T a, T b) { return a + b; } +template <typename T> __device__ T ncclOpProd(T a, T b) { return a * b; } +template <typename T> __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template <typename T> __device__ T ncclOpMin(T a, T b) { return a < b ? a : b; } + +// Definitions for half +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? a : b; +} + +template <typename T> __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template <typename T> __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); +} +template <typename T> __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); +} +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } + +template <typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)> +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue<T>(o + offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i = 1; i < nranks; i++) { + T val1 = testValue<T>(o + offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel<type, op<type>, preop<type>, postop<type>> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent,
ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count, (void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) + data[o] = testValue(o, rep, rank); +} + +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + args->barrier_idx = !args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); + } + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + 
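// average: 1 -> MPI_SUM (normalized by nProcs*nThreads below),
// 2 -> MPI_MIN, 3 -> MPI_MAX; average == 0 skips the MPI reduction
// and reports rank 0's own value.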
MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx = !args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); + while (remaining) { + int idle = 1; + for (int i = 0; i < ngpus; i++) { + if (done[i]) + continue; + + cudaErr = cudaStreamQuery(streams[i]); + if (cudaErr == cudaSuccess) { + done[i] = 1; + remaining--; + idle = 0; + continue; + } + + if (cudaErr != cudaErrorNotReady) + CUDACHECK(cudaErr); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } +#endif + } + + // We might want to let other threads (including NCCL threads) use the CPU. 
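// (A full pass over the streams with no completions marks the loop "idle";
//  yielding then keeps this busy-poll from starving NCCL's own proxy threads.)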
+ if (idle) + pthread_yield(); + } + free(done); + return testSuccess; +} + +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, root, comm, miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + char *sendBuff = ((char *)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + // miter doubles as the collective ID (collId). + TESTCHECK(args->collTest->runColl( + (void *)(in_place ?
recvBuff + args->recvInplaceOffset * rank + : recvBuff), miter, cbArgList + miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) + Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs *args) { + if (blocking_coll) + return testSuccess; + + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + pthread_mutex_lock(&cbArgList[i].mutex); + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // if (cudaDev == 0) { + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); + // } + + } + } + pthread_mutex_unlock(&cbArgList[i].mutex); + } + } + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + + size_t count = args->nbytes / wordSize(type); + + // Sync: following nccl-tests, run one untimed round here as well. + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + 0 * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + + Barrier(args); + + // int64_t NEW_TIMER = ParseIntegerFromEnv("NEW_TIMER", 0); + // int64_t SHOW_ITER_TIME = ParseIntegerFromEnv("SHOW_ITER_TIME", 0); + + // Performance Benchmark + #ifdef NEW_TIMER + double deltaSec = 0.0; + #else + auto start = std::chrono::high_resolution_clock::now(); + #endif + + for (int iter = 0; iter < iters; iter++) { + + #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME) + auto iter_start = std::chrono::high_resolution_clock::now(); + #endif + + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter, rankCtx)); + } + + TESTCHECK(completeColl(args)); + + #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME) + auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start; + double iter_deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count(); + + int cudaDev; + cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + if (cudaDev == 0) + OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6); + #endif + + #ifdef NEW_TIMER + deltaSec += iter_deltaSec; + #endif + } + + #ifndef NEW_TIMER + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast<std::chrono::duration<double>>(delta).count(); + #endif + + deltaSec = deltaSec / (iters * multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; + + #ifdef SHOW_AVG_TIME + int cudaDev; + cudaGetDevice(&cudaDev); + if (cudaDev == 0) + OFTEST_LOG(TEST, "Rank<%d>, time = %lf us, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + + // int clockRate; + // cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, cudaDev); + // int memoryClockRate; + // cudaDeviceGetAttribute(&memoryClockRate,
+
+testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type,
+                       ncclRedOp_t op, int root, int in_place,
+                       ofcclRankCtx_t rankCtx) {
+
+  size_t count = args->nbytes / wordSize(type);
+
+  // Sync: nccl-tests does this too, so run one untimed round first.
+  for (int miter = 0; miter < multi_iters; miter++) {
+    seenCqe[miter] = 0;
+    TESTCHECK(startColl(args, type, op, root, in_place,
+                        0 * multi_iters + miter, miter, rankCtx));
+  }
+  TESTCHECK(completeColl(args));
+
+  Barrier(args);
+
+  // int64_t NEW_TIMER = ParseIntegerFromEnv("NEW_TIMER", 0);
+  // int64_t SHOW_ITER_TIME = ParseIntegerFromEnv("SHOW_ITER_TIME", 0);
+
+  // Performance Benchmark
+  #ifdef NEW_TIMER
+  double deltaSec = 0.0;
+  #else
+  auto start = std::chrono::high_resolution_clock::now();
+  #endif
+
+  for (int iter = 0; iter < iters; iter++) {
+
+    #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME)
+    auto iter_start = std::chrono::high_resolution_clock::now();
+    #endif
+
+    for (int miter = 0; miter < multi_iters; miter++) {
+      seenCqe[miter] = 0;
+      TESTCHECK(startColl(args, type, op, root, in_place,
+                          iter * multi_iters + miter, miter, rankCtx));
+    }
+
+    TESTCHECK(completeColl(args));
+
+    #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME)
+    auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start;
+    double iter_deltaSec =
+        std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count();
+
+    int cudaDev;
+    cudaGetDevice(&cudaDev);
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
+    if (cudaDev == 0)
+      OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6);
+    #endif
+
+    #ifdef NEW_TIMER
+    deltaSec += iter_deltaSec;
+    #endif
+  }
+
+  #ifndef NEW_TIMER
+  auto delta = std::chrono::high_resolution_clock::now() - start;
+  double deltaSec =
+      std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+  #endif
+
+  deltaSec = deltaSec / (iters * multi_iters);
+  if (cudaGraphLaunches >= 1)
+    deltaSec = deltaSec / cudaGraphLaunches;
+
+  #ifdef SHOW_AVG_TIME
+  int cudaDev;
+  cudaGetDevice(&cudaDev);
+  if (cudaDev == 0)
+    OFTEST_LOG(TEST, "Rank<%d>, time = %lf us, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters);
+
+  // int clockRate;
+  // cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, cudaDev);
+  // int memoryClockRate;
+  // cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, cudaDev);
+  // OFTEST_LOG(TEST, "Rank<%d>, clockRate = %d, memoryClockRate = %d", cudaDev, clockRate, memoryClockRate);
+
+  // cudaDeviceProp prop;
+  // cudaGetDeviceProperties(&prop, cudaDev);
+  // OFTEST_LOG(TEST, "Rank<%d>, prop.clockRate = %d, prop.memoryClockRate = %d", cudaDev, prop.clockRate, prop.memoryClockRate);
+  #endif
+
+  Allreduce(args, &deltaSec, average);
+
+  double algBw, busBw;
+  args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw,
+                        args->nProcs * args->nThreads * args->nGpus);
+
+  Barrier(args);
+
+  ofcclDestroy(rankCtx);
+
+  double maxDelta = 0;
+  // static __thread int rep = 0; // parameter for re-initializing the buffers; no longer used.
+  // rep++;
+  if (datacheck) {
+
+    TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
+    // aggregate delta from all threads and procs
+    Allreduce(args, &maxDelta, 3);
+  }
+
+  double timeUsec = deltaSec * 1.0E6;
+  char timeStr[100];
+  if (timeUsec >= 10000.0) {
+    sprintf(timeStr, "%7.0f", timeUsec);
+  } else if (timeUsec >= 100.0) {
+    sprintf(timeStr, "%7.1f", timeUsec);
+  } else {
+    sprintf(timeStr, "%7.2f", timeUsec);
+  }
+  if (datacheck) {
+    PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
+  } else {
+    PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
+  }
+
+  args->bw[0] += busBw;
+  args->bw_count[0]++;
+  return testSuccess;
+}
+
+void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) {
+  int nranks = args->nProcs * args->nGpus * args->nThreads;
+  size_t count, sendCount, recvCount, paramCount, sendInplaceOffset,
+      recvInplaceOffset;
+
+  count = size / wordSize(type);
+  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount,
+                                   &sendInplaceOffset, &recvInplaceOffset,
+                                   (size_t)count, (size_t)nranks);
+
+  args->nbytes = paramCount * wordSize(type);
+  args->sendBytes = sendCount * wordSize(type);
+  args->expectedBytes = recvCount * wordSize(type);
+  args->sendInplaceOffset = sendInplaceOffset * wordSize(type);
+  args->recvInplaceOffset = recvInplaceOffset * wordSize(type);
+}
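setupArgs maps one logical transfer size onto per-rank send/recv/param byte counts through the collective's getCollByteCount hook. A worked example using the AllGather rules defined later in this diff (send side gets count/nranks elements, receive side the full count), written as a hypothetical standalone C++ program:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t nranks = 4, size = 1 << 20, wordsz = 4; // 1 MiB of float32
  size_t count = size / wordsz;                        // 262144 elements total
  size_t sendcount = count / nranks;                   // 65536 per rank
  size_t recvcount = (count / nranks) * nranks;        // 262144 gathered
  size_t paramcount = sendcount;                       // what runColl is given
  printf("sendBytes=%zu recvBytes=%zu paramBytes=%zu\n",
         sendcount * wordsz, recvcount * wordsz, paramcount * wordsz);
  return 0; // prints sendBytes=262144 recvBytes=1048576 paramBytes=262144
}

args->nbytes (the param count in bytes) is what BenchTime divides by wordSize(type) to recover the element count it passes to getBw.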
+
+testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type,
+                      const char *typeName, ncclRedOp_t op, const char *opName,
+                      int root, bool is_ofccl) {
+  // First create the ofcclRankCtx_t.
+  int thrdCudaDev;
+  CUDACHECK(cudaGetDevice(&thrdCudaDev));
+  ofcclRankCtx_t rankCtx;
+  ofcclInitRankCtx(&rankCtx, thrdCudaDev);
+
+  // Prepare for all sizes; op and type are traversed by the caller.
+  // TODO: if we support multi size, each size should use a separate ncclComm
+  for (size_t size = args->minbytes; size <= args->maxbytes;
+       size = ((args->stepfactor > 1) ? size * args->stepfactor
+                                      : size + args->stepbytes)) {
+    setupArgs(size, type, args);
+    for (int miter = 0; miter < multi_iters; miter++) {
+      TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx));
+    }
+  }
+
+  // Prepare the data needed for checking here.
+  static __thread int rep = 0;
+  rep++;
+  if (datacheck) { // Run the data-init kernels before the daemon kernel starts.
+    // Initialize sendbuffs, recvbuffs and expected
+    TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0));
+
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev);
+  }
+
+  ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a little and saves the kernel-launch time. Pair it with ofccl not exiting eagerly on its own.
+  // ofcclFinalizeRankCtx7StartHostThrds(rankCtx);
+
+  // TODO: if we support multi size, we could warm up every size, or keep the current approach but make sure the correct comm is selected.
+  // Warmup must stay enabled, otherwise ofccl performance collapses.
+  setupArgs(args->maxbytes, type, args);
+  for (int iter = 0; iter < warmup_iters; iter++) {
+    for (int miter = 0; miter < multi_iters; miter++) {
+      seenCqe[miter] = 0;
+      TESTCHECK(startColl(args, type, op, root, 0,
+                          iter * multi_iters + miter, miter, rankCtx));
+    }
+    TESTCHECK(completeColl(args));
+    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), thrdCudaDev, iter, multi_iters);
+  }
+
+  print_line_header(max(args->sendBytes, args->expectedBytes),
+                    args->nbytes / wordSize(type), typeName, opName, root);
+  TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx));
+  // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // ofcclDestroy was moved into BenchTime, so BenchTime cannot be called twice here (out-of-place first, then in-place); to get that back, add a loop inside BenchTime.
+  PRINT("\n");
+
+  return testSuccess;
+}
+
+testResult_t threadRunTests(struct threadArgs *args) {
+  // OFTEST_LOG1(TEST, "Enter threadRunTests");
+  // Set device to the first of our GPUs. If we don't do that, some operations
+  // will be done on the current GPU (by default : 0) and if the GPUs are in
+  // exclusive mode those operations will fail.
+  int gpuid = args->localRank * args->nThreads * args->nGpus +
+              args->thread * args->nGpus;
+  CUDACHECK(cudaSetDevice(gpuid));
+  TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype,
+                                   test_typenames[nccltype],
+                                   (ncclRedOp_t)ncclop, test_opnames[ncclop]));
+  return testSuccess;
+}
+
+testResult_t threadInit(struct threadArgs *args) {
+  // OFTEST_LOG1(TEST, "Enter threadInit");
+  char hostname[1024];
+  getHostName(hostname, 1024);
+  int nranks = args->nProcs * args->nThreads * args->nGpus;
+
+  // set main thread again
+  is_main_thread = (args->proc == 0 && args->thread == 0) ?
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i = 0; i < args->nGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread *thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char *argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); + + if (c == -1) + break; + + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid 
size specified for 'maxbytes'\n");
+        return -1;
+      }
+      maxBytes = (size_t)parsed;
+      break;
+    case 'i':
+      stepBytes = strtol(optarg, NULL, 0);
+      break;
+    case 'f':
+      stepFactor = strtol(optarg, NULL, 0);
+      break;
+    case 'n':
+      iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'M':
+      multi_iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'm':
+#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2)
+      agg_iters = (int)strtol(optarg, NULL, 0);
+#else
+      fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n");
+#endif
+      break;
+    case 'w':
+      warmup_iters = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'c':
+      datacheck = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'p':
+      parallel_init = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'o':
+      ncclop = ncclstringtoop(optarg);
+      break;
+    case 'd':
+      nccltype = ncclstringtotype(optarg);
+      break;
+    case 'r':
+      ncclroot = strtol(optarg, NULL, 0);
+      break;
+    case 'z':
+      blocking_coll = strtol(optarg, NULL, 0);
+      break;
+    case 'G':
+#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \
+    CUDART_VERSION >= 11030
+      cudaGraphLaunches = strtol(optarg, NULL, 0);
+#else
+      printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA "
+             "11.3. Ignoring\n");
+#endif
+      break;
+    case 'a':
+      average = (int)strtol(optarg, NULL, 0);
+      break;
+    case 'h':
+    default:
+      if (c != 'h')
+        printf("invalid option '%c'\n", c);
+      printf("USAGE: %s \n\t"
+             "[-t,--nthreads <num threads>] \n\t"
+             "[-g,--ngpus <gpus per thread>] \n\t"
+             "[-b,--minbytes <min size in bytes>] \n\t"
+             "[-e,--maxbytes <max size in bytes>] \n\t"
+             "[-i,--stepbytes <increment size>] \n\t"
+             "[-f,--stepfactor <increment factor>] \n\t"
+             "[-n,--iters <iteration count>] \n\t"
+             "[-m,--agg_iters <aggregated iteration count>] \n\t"
+             "[-M,--multi_iters <multi iteration count>] \n\t"
+             "[-w,--warmup_iters <warmup iteration count>] \n\t"
+             "[-p,--parallel_init <0/1>] \n\t"
+             "[-c,--check <0/1>] \n\t"
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
+             "[-o,--op <sum/prod/min/max/avg/mulsum/all>] \n\t"
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
+             "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
+#else
+             "[-o,--op <sum/prod/min/max/all>] \n\t"
+#endif
+             "[-d,--datatype <nccltype/all>] \n\t"
+             "[-r,--root <root>] \n\t"
+             "[-z,--blocking <0/1>] \n\t"
+             "[-G,--cudagraph <num graph launches>] \n\t"
+             "[-a,--average <0/1/2/3> report average iteration time "
+             "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
+             "[-h,--help]\n",
+             basename(argv[0]));
+      return 0;
+    }
+  }
+  if (minBytes > maxBytes) {
+    fprintf(stderr,
+            "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n",
+            (unsigned long long)minBytes, (unsigned long long)maxBytes);
+    return -1;
+  }
+#ifdef MPI_SUPPORT
+  MPI_Init(&argc, &argv);
+#endif
+  TESTCHECK(run());
+  return 0;
+}
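main() hands off to run(), which sets up buffers, comms, and threads; the ofccl call sequence those threads eventually drive is easier to see in isolation. The sketch below condenses it to one rank and one collective, using only entry points that appear elsewhere in this diff (ofcclInitRankCtx, ofcclPrepareAllReduce, ofcclPrepareDone, ofcclRunAllReduce, ofcclDestroy). ofcclMinimalFlow is a hypothetical helper, the device buffers and comm are assumed to exist, and the polling loop stands in for completeColl:

// Minimal single-rank, single-collective flow (sketch; CallBackArgs and
// myCallback are the handshake pair defined in this diff).
testResult_t ofcclMinimalFlow(ncclComm_t comm, void *d_send, void *d_recv,
                              size_t count) {
  int dev;
  CUDACHECK(cudaGetDevice(&dev));

  ofcclRankCtx_t rankCtx;
  ofcclInitRankCtx(&rankCtx, dev);                  // per-rank context

  const int collId = 0;                             // ids are dense: 0..N-1
  NCCLCHECK(ofcclPrepareAllReduce(count, ncclFloat, ncclSum, comm,
                                  collId, rankCtx)); // register the collective
  ofcclPrepareDone(rankCtx);                        // per the comments above,
                                                    // this starts the daemon kernel

  CallBackArgs cb;                                  // CQE handshake state
  cb.collId = collId;
  cb.gotCqe = 0;
  pthread_mutex_init(&cb.mutex, NULL);
  NCCLCHECK(ofcclRunAllReduce(d_send, d_recv, collId, myCallback, &cb, rankCtx));

  for (;;) {                                        // poll until the CQE lands
    pthread_mutex_lock(&cb.mutex);
    int done = cb.gotCqe;
    pthread_mutex_unlock(&cb.mutex);
    if (done) break;
  }

  ofcclDestroy(rankCtx);                            // tear down the daemon
  return testSuccess;
}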
"factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? (char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + if (maxBytes > memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) + printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads]; + void *recvbuffs[nGpus * nThreads]; + void *expected[nGpus * nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + (size_t)nProcs * nGpus * nThreads); + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + expected + i, (size_t)maxBytes, + nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when use multi size. + ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use seprate comm + // TODO: we do not support MPI now. 
+ for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: 不支持多个size。 + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + threads[t].args.sendbuffs = sendbuffs + t * nGpus; + threads[t].args.recvbuffs = recvbuffs + t * nGpus; + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + if (sendbuffs[i]) + CUDACHECK(cudaFree((char *)sendbuffs[i])); + if (recvbuffs[i]) + CUDACHECK(cudaFree((char *)recvbuffs[i])); + if (datacheck) + CUDACHECK(cudaFree(expected[i])); + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h new file mode 100644 index 0000000..daba610 --- /dev/null +++ b/src_simple/common_simple.h @@ -0,0 +1,295 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +// 环境变量是方便,但是会多一些判断,可能影响性能。 +// #define DEBUG_PRINT 1 + +// #define NEW_TIMER 1 +// #define SHOW_ITER_TIME 1 +#define SHOW_AVG_TIME 1 + +#define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + // int cqeCnt; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct 
threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_simple/ofccl_all_gather.cu b/src_simple/ofccl_all_gather.cu new file mode 100644 index 0000000..6cf8ddf --- /dev/null +++ b/src_simple/ofccl_all_gather.cu @@ -0,0 +1,151 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", ""); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s", size, count, typeName); +} + +void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count/nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = count/nranks; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunAllGather(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllGather for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllGather sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllGather(count, datatype, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllGather with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allGatherTest = { + "AllGather", + AllGatherGetCollByteCount, + AllGatherInitData, + AllGatherGetBw, + AllGatherRunColl, + AllGatherPrepare +}; + +void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllGatherGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allGatherTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s 
out-of-place in-place \n", "", "", "", "");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
+        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
+  PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
+        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
+}
+
+void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
+  PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
+}
+
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = *sendcount;
+}
+
+testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
+    CUDACHECK(cudaSetDevice(gpuid));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, type, rep, rank));
+    TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+
+  // int cudaDev;
+  // CUDACHECK(cudaGetDevice(&cudaDev));
+  // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev);
+  return testSuccess;
+}
+
+void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  double baseBw = (double)(count * typesize) / 1.0E9 / sec;
+
+  *algBw = baseBw;
+  // Ring allreduce moves 2*(n-1)/n bytes per rank per payload byte, so busbw
+  // scales algbw by that factor; e.g. nranks=4 gives 1.5x.
+  double factor = ((double)(2*(nranks - 1)))/((double)nranks);
+  *busBw = baseBw * factor;
+}
+
+int myCallback(int collIdFromCqe, void *args) {
+  // Deleted the logging here; otherwise it hurts performance.
+  // if (collId != collIdFromCqe) {
+  //   // more robust error handling.
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + + // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl, + AllReducePrepare +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6i", size, count, typeName, root); +} + +void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t 
*sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunBroadcast(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunBroadcast for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunBroadcast sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t BroadcastPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareBroadcast(count, datatype, root, comm, collId, rankCtx)); + OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareBroadcast with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl broadcastTest = { + "Broadcast", + BroadcastGetCollByteCount, + BroadcastInitData, + BroadcastGetBw, + BroadcastRunColl, + BroadcastPrepare +}; + +void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + BroadcastGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t BroadcastRunTest(struct threadArgs* args, int root, 
ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &broadcastTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root); +} + +void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + *algBw = baseBw; + *busBw = baseBw; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t ReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareReduce(count, datatype, op, root, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl reduceTest = { + "Reduce", + ReduceGetCollByteCount, + ReduceInitData, + ReduceGetBw, + ReduceRunColl, + ReducePrepare +}; + +void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void 
ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; + *recvcount = count/nranks; + *sendInplaceOffset = 0; + *recvInplaceOffset = count/nranks; + *paramcount = *recvcount; +} + +testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunReduceScatter(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduceScatter for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduceScatter sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t ReduceScatterPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareReduceScatter(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareReduceScatter with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl 
reduceScatterTest = { + "ReduceScatter", + ReduceScatterGetCollByteCount, + ReduceScatterInitData, + ReduceScatterGetBw, + ReduceScatterRunColl, + ReduceScatterPrepare +}; + +void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceScatterGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceScatterTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i>"+op['nccl_bw_path']) + os.system("echo $(date +%F%n%T)>>"+op['nccl_time_path']) + + + for iter in NCCL_TIER: + # raw data + AR['nccl_rawData'] = NCCL_RES_DIR+"/nccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + AG['nccl_rawData'] = NCCL_RES_DIR+"/nccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + B['nccl_rawData'] = NCCL_RES_DIR+"/nccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + R['nccl_rawData'] = NCCL_RES_DIR+"/nccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + RS['nccl_rawData'] = NCCL_RES_DIR+"/nccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + + if runNcclTest: + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>> "+op['nccl_rawData']) + + for a in buffer_sizes: + os.system(op['run']+" -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ op['nccl_rawData']) + + if staticNccl: + for op in [AR,AG,B,R,RS]: + os.system("./nccl/static_nccl.out " +op['nccl_rawData'] +" " +op['nccl_bw_path']) + os.system("./nccl/static_time.out " +op['nccl_rawData'] +" " +op['nccl_time_path']) + + + + if collectNcclResult : + for op in [AR,AG,B,R,RS]: + # bus + op['bwSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + + with open(op['nccl_bw_path']) as f: + content = f.read() + bw = content.split() + + axis_y = buffer_sizes + for a in range(0,25): + op['bwSheet'].write(2+a+cnt*30,0,axis_y[a],style) + # + for k in [0,1,2]: + op['bwSheet'].write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,1+k,bw[i+k*50+2],style) + + op['bwSheet'].write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) + # avg + op['bwSheet'].write(1+cnt*30, 4, 'avg-algbw',style) + op['bwSheet'].write(1+cnt*30, 15, 'avg-busbw',style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) + op['bwSheet'].write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) + + # time + with open(op['nccl_time_path']) as f2: + content2 = f2.read() + times = content2.split() + + op['tmSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + for a in 
+            # time
+            with open(op['nccl_time_path']) as f2:
+                content2 = f2.read()
+            times = content2.split()
+
+            op['tmSheet'].write(cnt*30,0,str(MY_NUM_DEV)+' cards',style)
+            for a in range(0,25):
+                op['tmSheet'].write(2+a+cnt*30,0,axis_y[a],style)
+            for k in [0,1,2]:
+                op['tmSheet'].write(1+cnt*30,1+k,'nccl-'+str(k),style)
+                for i in range(0,25):
+                    op['tmSheet'].write(2+i+cnt*30,1+k,times[i+k*25+2],style)
+            # avg
+            op['tmSheet'].write(1+cnt*30, 4, 'avg-nccl',style)
+            for i in range(0,25):
+                op['tmSheet'].write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style)
+
+    # OFCCL
+    # Create the directory that holds the experiment results
+    OFCCL_RES_DIR = "./ofccl/test_result_"+DATE+"_"+NCCL_ORDER+"_"+str(MY_NUM_DEV)+"cards"
+    if not os.path.exists(OFCCL_RES_DIR):
+        os.makedirs(OFCCL_RES_DIR)
+    # Aggregated results
+    AR['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards.txt"
+    AR['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_order_"+str(MY_NUM_DEV)+"cards.txt"
+    AR['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_time.txt"
+    AR['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    AR['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    AR['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    AR['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+
+    AG['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards.txt"
+    AG['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_order_"+str(MY_NUM_DEV)+"cards.txt"
+    AG['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_time.txt"
+    AG['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    AG['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    AG['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    AG['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+
+    B['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards.txt"
+    B['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_order_"+str(MY_NUM_DEV)+"cards.txt"
+    B['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_time.txt"
+    B['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    B['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    B['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    B['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+
+    R['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards.txt"
+    R['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_order_"+str(MY_NUM_DEV)+"cards.txt"
+    R['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_time.txt"
+    R['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    R['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    R['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    R['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+    RS['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards.txt"
+    RS['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_order_"+str(MY_NUM_DEV)+"cards.txt"
+    RS['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_time.txt"
+    RS['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_order_"+str(MY_NUM_DEV)+"cards_time.txt"
+    RS['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_QE.txt"
+    RS['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_QE_ori.txt"
+    RS['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_totalCnt.txt"
+
+    if staticOfccl:
+        for op in [AR,AG,B,R,RS]:
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_bw_path'])
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_tm_path'])
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_bw_order_path'])
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_tm_order_path'])
+    if staticOfcclExtral:
+        for op in [AR,AG,B,R,RS]:
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_qe_path'])
+            os.system("echo $(date +%F%n%T)>>"+op['ofccl_qeOri_path'])
+
+    for iter in OFCCL_ITER:
+        # raw data
+        AR['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+        AG['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+        B['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+        R['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+        RS['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt"
+
+        if runOfcclTest:
+            for op in [AR,AG,B,R,RS]:
+                os.system("echo $(date +%F%n%T)>> "+op['ofccl_rawData'])
+                for a in buffer_sizes:
+                    os.system(op['runOfccl']+" -b "+str(a)+" -e "+str(a)+" -f 2 -t "+str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M)+" >>"+op['ofccl_rawData'])
+        if staticOfccl:
+            for op in [AR,AG,B,R,RS]:
+                os.system("./ofccl/static_ofccl_bw.out "+op['ofccl_rawData']+" "+op['ofccl_bw_path'])
+                os.system("./ofccl/static_ofccl_time.out "+op['ofccl_rawData']+" "+op['ofccl_tm_path'])
+        if staticOfcclExtral:
+            for op in [AR,AG,B,R,RS]:
+                os.system("./ofccl/static_ofccl_QE.out "+op['ofccl_rawData']+" "+op['ofccl_qe_path'])
+                os.system("./ofccl/static_ofccl_QE_ori.out "+op['ofccl_rawData']+" "+op['ofccl_qeOri_path'])
+                os.system("./ofccl/static_ofccl_totalCnt.out "+op['ofccl_rawData']+" "+op['ofccl_totalCnt_path'])
+    if staticOfccl:
+        for op in [AR,AG,B,R,RS]:
+            os.system("./ofccl/static_ofccl_bw_order.out "+op['ofccl_bw_path']+" "+op['ofccl_bw_order_path']+" "+str(len(OFCCL_ITER)))
+            os.system("./ofccl/static_ofccl_tm_order.out "+op['ofccl_tm_path']+" "+op['ofccl_tm_order_path']+" "+str(len(OFCCL_ITER)))
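The five per-op path blocks above all follow one pattern; for reference, an equivalent loop form (a sketch only, using the AR/AG/B/R/RS dicts that the script defines earlier):

    for op, name in [(AR, "allReduce"), (AG, "allGather"), (B, "broadcast"),
                     (R, "reduce"), (RS, "reduceScatter")]:
        base = OFCCL_RES_DIR + "/result_ofccl_" + name + "_" + str(MY_NUM_DEV)
        order = OFCCL_RES_DIR + "/result_ofccl_" + name + "_order_" + str(MY_NUM_DEV)
        op['ofccl_bw_path'] = base + "cards.txt"
        op['ofccl_bw_order_path'] = order + "cards.txt"
        op['ofccl_tm_path'] = base + "cards_time.txt"
        op['ofccl_tm_order_path'] = order + "cards_time.txt"
        op['ofccl_qe_path'] = base + "cards_QE.txt"
        op['ofccl_qeOri_path'] = base + "cards_QE_ori.txt"
        op['ofccl_totalCnt_path'] = base + "cards_totalCnt.txt"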
+    if collectOfcclResult:
+        # bus bandwidth
+        for op in [AR,AG,B,R,RS]:
+            with open(op['ofccl_bw_order_path']) as f2:
+                content2 = f2.read()
+            bw = content2.split()
+
+            for k in [0,1,2]:
+                op['bwSheet'].write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style)
+                for i in range(0,25):
+                    op['bwSheet'].write(2+i+cnt*30,5+k,bw[i+k*50+2],style)
+
+                op['bwSheet'].write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style)
+                for i in range(0,25):
+                    op['bwSheet'].write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style)
+            # avg
+            op['bwSheet'].write(1+cnt*30, 8, 'avg-algbw',style)
+            op['bwSheet'].write(1+cnt*30, 19, 'avg-busbw',style)
+            for i in range(0,25):
+                op['bwSheet'].write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style)
+                op['bwSheet'].write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style)
+
+            # time
+            with open(op['ofccl_tm_order_path']) as f2:
+                content2 = f2.read()
+            times = content2.split()
+
+            for k in [0,1,2]:
+                op['tmSheet'].write(1+cnt*30,5+k,'ofccl-'+str(k),style)
+                for i in range(0,25):
+                    op['tmSheet'].write(2+i+cnt*30,5+k,times[i+k*25+2],style)
+            # avg
+            op['tmSheet'].write(1+cnt*30, 8, 'avg-ofccl',style)
+            for i in range(0,25):
+                op['tmSheet'].write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style)
+
+    if collectNcclResult and collectOfcclResult:
+        for op in [AR,AG,B,R,RS]:
+            op['bwSheet'].write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style)
+            op['bwSheet'].write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style)
+            op['tmSheet'].write(1+cnt*30, 9, 'ofccl-nccl',style)
+            op['tmSheet'].write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style)
+            for i in range(0,25):
+                op['bwSheet'].write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style)
+                op['bwSheet'].write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1)),style)
+                op['tmSheet'].write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)),style)
+                op['tmSheet'].write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)),style)
+
+    # Column headers for the extra time statistics
+    if staticOfcclExtral:
+        for op in [AR,AG,B,R,RS]:
+            op['tmSheet'].write(1+cnt*30, 13,'nccl IO',style)
+            op['tmSheet'].write(1+cnt*30, 14,'nccl kern',style)
+            op['tmSheet'].write(1+cnt*30, 15,'ofccl-nccl kern',style)
+            op['tmSheet'].write(1+cnt*30, 16,'before after get sqe',style)
+            op['tmSheet'].write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style)
+            op['tmSheet'].write(1+cnt*30, 18,'before after put cqe',style)
+            op['tmSheet'].write(1+cnt*30, 19,'beforeSqe TO afterCqe',style)
+            op['tmSheet'].write(1+cnt*30, 20,'ofccl rank0 time',style)
+            op['tmSheet'].write(1+cnt*30, 21,'nccl kern ori',style)
+            op['tmSheet'].write(1+cnt*30, 27,'before after get sqe ori',style)
+            op['tmSheet'].write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style)
+            op['tmSheet'].write(1+cnt*30, 39,'before after put cqe ori',style)
+            op['tmSheet'].write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style)
+
+            y = 64
+            for i in range(0,25):
+                op['tmSheet'].write(2+i+cnt*30,12,y,style)
+                y = y*2
+
+            with open(op['ofccl_qe_path']) as f3:
+                content3 = f3.read()
+            times = content3.split()
+            with open(op['ofccl_qeOri_path']) as f4:
+                content4 = f4.read()
+            times4 = content4.split()
+            for i in range(0,25):
+                op['tmSheet'].write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30)),style)
+                op['tmSheet'].write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+')'),style)
+                op['tmSheet'].write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30)),style)
+                op['tmSheet'].write(2+cnt*30+i,16,times[2+125*cnt+i],style)
+                op['tmSheet'].write(2+cnt*30+i,17,times[2+125*cnt+25+i],style)
+                op['tmSheet'].write(2+cnt*30+i,18,times[2+125*cnt+50+i],style)
+                op['tmSheet'].write(2+cnt*30+i,19,times[2+125*cnt+75+i],style)
+                op['tmSheet'].write(2+cnt*30+i,20,times[2+125*cnt+100+i],style)
+                for j in range(0,5):
+                    op['tmSheet'].write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style)
+                    op['tmSheet'].write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style)
+                    op['tmSheet'].write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style)
+                    op['tmSheet'].write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style)
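The flat index arithmetic above is easy to misread, so here is the layout it assumes (a sketch inferred from the writes, not from a format spec): each echo of `date +%F%n%T` contributes two leading tokens, hence the +2.

    # static_ofccl_QE.out appends, per card-count block: 2 date tokens, then
    # 5 metrics x 25 buffer sizes = 125 averaged values.
    def qe_value(tokens, block, metric, size_idx):
        return tokens[2 + 125 * block + 25 * metric + size_idx]

    # static_ofccl_QE_ori.out: 4 metrics x 25 sizes x 5 raw samples = 500 per block.
    def qe_ori_value(tokens, block, metric, size_idx, sample):
        return tokens[2 + 500 * block + 125 * metric + 5 * size_idx + sample]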
+            # cntSheet
+            op['cntSheet'].write(cnt*30,0,str(MY_NUM_DEV)+' cards',style)
+            axis_y = buffer_sizes
+            for a in range(0,25):
+                op['cntSheet'].write(2+a+cnt*30,0,axis_y[a],style)
+
+            op['cntSheet'].write(1+cnt*30,1,"totalCtxSaveCnt_avg",style)
+            op['cntSheet'].write(1+cnt*30,2,"totalCtxLoadCnt_avg",style)
+            op['cntSheet'].write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style)
+            op['cntSheet'].write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style)
+            op['cntSheet'].write(1+cnt*30,6,"totalCtxSaveCnt",style)
+            op['cntSheet'].write(1+cnt*30,24,"totalCtxLoadCnt",style)
+            op['cntSheet'].write(1+cnt*30,42,"totalProgressed7SwithchCnt",style)
+            op['cntSheet'].write(1+cnt*30,60,"totalUnprogressedQuitCnt",style)
+
+            with open(op['ofccl_totalCnt_path']) as f:
+                line = f.readline()
+                # save
+                for i in range(0,25):
+                    numbers = line.split()
+                    op['cntSheet'].write(i+2+cnt*30,1,numbers[0])
+                    for j in range(1,len(numbers)):
+                        op['cntSheet'].write(i+2+cnt*30,5+j,numbers[j])
+                    line = f.readline()
+                # load
+                for i in range(0,25):
+                    numbers = line.split()
+                    op['cntSheet'].write(i+2+cnt*30,2,numbers[0])
+                    for j in range(1,len(numbers)):
+                        op['cntSheet'].write(i+2+cnt*30,23+j,numbers[j])
+                    line = f.readline()
+                # totalProgressed7SwithchCnt
+                for i in range(0,25):
+                    numbers = line.split()
+                    op['cntSheet'].write(i+2+cnt*30,3,numbers[0])
+                    for j in range(1,len(numbers)):
+                        op['cntSheet'].write(i+2+cnt*30,41+j,numbers[j])
+                    line = f.readline()
+                # totalUnprogressedQuitCnt
+                for i in range(0,25):
+                    numbers = line.split()
+                    op['cntSheet'].write(i+2+cnt*30,4,numbers[0])
+                    for j in range(1,len(numbers)):
+                        op['cntSheet'].write(i+2+cnt*30,59+j,numbers[j])
+                    line = f.readline()
+
+    cnt = cnt+1
+
+# Save the Excel workbook
+if collectNcclResult or collectOfcclResult:
+    table.save(resultXlsName)
\ No newline at end of file
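The script shells out to ./nccl/*.out and ./ofccl/*.out, which are the C++ parsers added below compiled ahead of time. The patch itself does not include a build step; one plausible sketch (the g++ invocation and flags are an assumption, not part of this diff):

    import glob
    import subprocess

    # Hypothetical build step: compile each parser next to its source file.
    for src in glob.glob("test_scripts/*/*.cpp"):
        subprocess.run(["g++", "-O2", "-o", src.replace(".cpp", ".out"), src], check=True)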
-d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 5 + do + for w in 2 + do + for m in 1 + do + for iter in 1 + do + export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/nccl/static_nccl.cpp b/test_scripts/nccl/static_nccl.cpp new file mode 100644 index 0000000..911fd0c --- /dev/null +++ b/test_scripts/nccl/static_nccl.cpp @@ -0,0 +1,42 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + b.push_back(ss.top()); + ss.pop(); + a.push_back(ss.top()); + + if(++cnt == 25) + break; + } + + for(auto a1:a) + cout< +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + ss.pop(); + ss.pop(); + a.push_back(ss.top()); + + if(++cnt == 25) + break; + } + + for(auto a1:a) + cout< +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + int ranks = *(argv[3]) - '0'; + string str; + stringstream ss; + vector a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (11+ranks);i++) + getline(cin,line); + + for(int i =0;i < 6;i++) + cin >> str; + + a.push_back(str); + cin >> str; + b.push_back(str); + + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + for(int i=0;i +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + int ranks = *(argv[3]) - '0'; + string str; + stringstream ss; + vector a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (11+ranks);i++) + getline(cin,line); + + for(int i =0;i < 5;i++) + cin >> str; + + a.push_back(str); + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + for(int i=0;i> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/ofccl/static_ofccl_QE.cpp b/test_scripts/ofccl/static_ofccl_QE.cpp new file mode 100644 index 0000000..3705bdb --- /dev/null +++ b/test_scripts/ofccl/static_ofccl_QE.cpp @@ -0,0 +1,174 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + 
diff --git a/test_scripts/ofccl/static_ofccl_QE.cpp b/test_scripts/ofccl/static_ofccl_QE.cpp
new file mode 100644
index 0000000..3705bdb
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_QE.cpp
@@ -0,0 +1,174 @@
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+    vector<string> time;
+    vector<double> sqe;
+    vector<double> beforeCqe;
+    vector<double> putCqe;
+    vector<double> afterCqe;
+    string bw="bandwidth";
+
+    int cnt = 0;
+    double sqe_sum = 0;
+    int sqe_cnt = 0;
+
+    double beforeCqe_sum = 0;
+    int beforeCqe_cnt = 0;
+
+    double putCqe_sum = 0;
+    int putCqe_cnt = 0;
+
+    double afterCqe_sum = 0;
+    int afterCqe_cnt = 0;
+
+    while(getline(cin, inputLine)){
+        if(inputLine.find(bw,0) != -1){
+            // The "bandwidth" line marks the end of one buffer-size block.
+            // before after get sqe
+            double sqe_avg = sqe_sum / sqe_cnt;
+            sqe.push_back(sqe_avg);
+            sqe_sum = 0;
+            sqe_cnt = 0;
+            // AfterSqe TO BeforeCqe
+            double beforeCqe_avg = beforeCqe_sum / beforeCqe_cnt;
+            beforeCqe.push_back(beforeCqe_avg);
+            beforeCqe_sum = 0;
+            beforeCqe_cnt = 0;
+            // before after put cqe
+            double putCqe_avg = putCqe_sum / putCqe_cnt;
+            putCqe.push_back(putCqe_avg);
+            putCqe_sum = 0;
+            putCqe_cnt = 0;
+            // beforeSqe TO afterCqe
+            double afterCqe_avg = afterCqe_sum/afterCqe_cnt;
+            afterCqe.push_back(afterCqe_avg);
+            afterCqe_sum = 0;
+            afterCqe_cnt = 0;
+
+            if(++cnt == 25)
+                break;
+        }
+        // rank0 time
+        int pos = -1;
+        if ((pos=inputLine.find("time = ",0)) != -1){
+            pos += 7;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            time.push_back(t);
+            continue;
+        }
+
+        // before after get sqe
+        if ((pos=inputLine.find("before after get sqe AVG",0)) != -1){
+            pos += 27;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            stringstream ss;
+            double tt;
+            ss << t;
+            ss >> tt;
+            pos = inputLine.find("weight = ",0);
+            pos += 9;
+            int count = inputLine[pos] - '0';
+            sqe_sum += tt * count;
+            sqe_cnt += count;
+            continue;
+        }
+        // AfterSqe TO BeforeCqe
+        if ((pos=inputLine.find("AfterSqe TO BeforeCqe AVG",0)) != -1){
+            pos += 28;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            stringstream ss;
+            double tt;
+            ss << t;
+            ss >> tt;
+            pos = inputLine.find("weight = ",0);
+            pos += 9;
+            int count = inputLine[pos] - '0';
+            beforeCqe_sum += tt * count;
+            beforeCqe_cnt += count;
+            continue;
+        }
+
+        // before after put cqe
+        if ((pos=inputLine.find("before after put cqe AVG ",0)) != -1){
+            pos += 27;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            stringstream ss;
+            double tt;
+            ss << t;
+            ss >> tt;
+            pos = inputLine.find("weight = ",0);
+            pos += 9;
+            int count = inputLine[pos] - '0';
+            putCqe_sum += tt * count;
+            putCqe_cnt += count;
+            continue;
+        }
+        // beforeSqe TO afterCqe
+        if ((pos=inputLine.find("beforeSqe TO afterCqe AVG = ",0)) != -1){
+            pos += 28;
+            string t="";
+            while(inputLine[pos] != ' '){
+                t += inputLine[pos];
+                pos++;
+            }
+            stringstream ss;
+            double tt;
+            ss << t;
+            ss >> tt;
+            pos = inputLine.find("weight = ",0);
+            pos += 9;
+            int count = inputLine[pos] - '0';
+            afterCqe_sum += tt * count;
+            afterCqe_cnt += count;
+            continue;
+        }
+    }
+
+    // before after get sqe
+    for (auto s:sqe){
+        cout << s << endl;
+    }
+    // AfterSqe TO BeforeCqe
+    for (auto s:beforeCqe){
+        cout << s << endl;
+    }
+    // before after put cqe
+    for (auto s:putCqe){
+        cout << s << endl;
+    }
+    // beforeSqe TO afterCqe
+    for (auto s:afterCqe){
+        cout << s << endl;
+    }
+    // rank0 time
+    for (auto t:time){
+        cout << t << endl;
+    }
+    return 0;
+}
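static_ofccl_QE.cpp computes a weighted mean per buffer size: each log line carries a per-thread average plus its sample count ("weight = N"), and the block average weights each AVG by its N. A compact restatement (a sketch with made-up inputs):

    # Weighted mean as accumulated by static_ofccl_QE.cpp above.
    def weighted_avg(pairs):                 # pairs: [(avg, weight), ...]
        total = sum(a * w for a, w in pairs)
        n = sum(w for _, w in pairs)
        return total / n if n else float("nan")

    print(weighted_avg([(1.5, 4), (2.0, 1)]))   # -> 1.6, not the naive 1.75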
diff --git a/test_scripts/ofccl/static_ofccl_QE_ori.cpp b/test_scripts/ofccl/static_ofccl_QE_ori.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_QE_ori.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+
+    vector<double> sqe_ori;
+    vector<double> beforeCqe_ori;
+    vector<double> putCqe_ori;
+    vector<double> afterCqe_ori;
+    string bw="bandwidth";
+
+    int cnt=0;
+    while(getline(cin, inputLine)){
+        if(inputLine.find(bw,0) != -1){
+            // The "bandwidth" line marks the end of one buffer-size block.
+            if(++cnt == 25)
+                break;
+        }
+        int pos = -1;
+        // before after get sqe
+        if ((pos=inputLine.find("Rank<0> Blk<0> Thrd<0> coll_id = 0, before after get sqe = ",0)) != -1){
+            pos += 58;
+            string numbers = inputLine.substr(pos);
+            stringstream ss;
+            ss << numbers;
+            for(int i = 0;i < 5;i++){
+                double tmp;
+                ss >> tmp;
+                sqe_ori.push_back(tmp);
+            }
+            continue;
+        }
+        // AfterSqe TO BeforeCqe
+        if ((pos=inputLine.find("AfterSqe TO BeforeCqe = ",0)) != -1){
+            pos += 24;
+            string numbers = inputLine.substr(pos);
+            stringstream ss;
+            ss << numbers;
+            for(int i = 0;i < 5;i++){
+                double tmp;
+                ss >> tmp;
+                if(tmp > 0.00001)
+                    beforeCqe_ori.push_back(tmp);
+            }
+            continue;
+        }
+
+        // before after put cqe
+        if ((pos=inputLine.find("before after put cqe = ",0)) != -1){
+            pos += 23;
+            string numbers = inputLine.substr(pos);
+            stringstream ss;
+            ss << numbers;
+            for(int i = 0;i < 5;i++){
+                double tmp;
+                ss >> tmp;
+                if(tmp > 0.00001)
+                    putCqe_ori.push_back(tmp);
+            }
+            continue;
+        }
+
+        // beforeSqe TO afterCqe
+        if ((pos=inputLine.find("beforeSqe TO afterCqe = ",0)) != -1){
+            pos += 24;
+            string numbers = inputLine.substr(pos);
+            stringstream ss;
+            ss << numbers;
+            for(int i = 0;i < 5;i++){
+                double tmp;
+                ss >> tmp;
+                if(tmp > 0.00001)
+                    afterCqe_ori.push_back(tmp);
+            }
+            continue;
+        }
+    }
+
+    // before after get sqe
+    for(int i = 0;i < 25;i++){
+        for(int j = 0;j < 5;j++)
+            cout<<sqe_ori[i*5+j]<<" ";
+        cout<<endl;
+    }
+    // AfterSqe TO BeforeCqe
+    for(int i = 0;i < 25;i++){
+        for(int j = 0;j < 5;j++)
+            cout<<beforeCqe_ori[i*5+j]<<" ";
+        cout<<endl;
+    }
+    // before after put cqe
+    for(int i = 0;i < 25;i++){
+        for(int j = 0;j < 5;j++)
+            cout<<putCqe_ori[i*5+j]<<" ";
+        cout<<endl;
+    }
+    // beforeSqe TO afterCqe
+    for(int i = 0;i < 25;i++){
+        for(int j = 0;j < 5;j++)
+            cout<<afterCqe_ori[i*5+j]<<" ";
+        cout<<endl;
+    }
+    return 0;
+}
diff --git a/test_scripts/ofccl/static_ofccl_bw.cpp b/test_scripts/ofccl/static_ofccl_bw.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_bw.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+    vector<string> a;
+    vector<string> b;
+    string ss="bandwidth";
+    string str = "N/A";
+    int cnt = 0;
+    while(getline(cin, inputLine)){
+        if (inputLine.find(str,0) == -1)
+            continue;
+
+        stringstream line;
+        line << inputLine;
+        string tmp;
+        stack<string> ss;
+        while(line >> tmp){
+            ss.push(tmp);
+        }
+        ss.pop();
+        b.push_back(ss.top());
+        ss.pop();
+        a.push_back(ss.top());
+
+        if(++cnt == 25)
+            break;
+    }
+
+    for(auto a1:a)
+        cout<<a1<<endl;
+    for(auto b1:b)
+        cout<<b1<<endl;
+    return 0;
+}
diff --git a/test_scripts/ofccl/static_ofccl_bw_order.cpp b/test_scripts/ofccl/static_ofccl_bw_order.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_bw_order.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+    int num = *(argv[3]) - '0';
+
+    // One max-heap per buffer size: re-emit bandwidths best-first.
+    vector<priority_queue<double,vector<double>,less<double>>> a(25,priority_queue<double,vector<double>,less<double>>());
+    vector<priority_queue<double,vector<double>,less<double>>> b(25,priority_queue<double,vector<double>,less<double>>());
+
+    for(int i = 0;i < num;i++){
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            cin>>tmp;
+            a[j].push(tmp);
+        }
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            cin>>tmp;
+            b[j].push(tmp);
+        }
+    }
+
+    for(int i = 0;i < num;i++){
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            tmp = a[j].top();a[j].pop();
+            cout<<tmp<<endl;
+        }
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            tmp = b[j].top();b[j].pop();
+            cout<<tmp<<endl;
+        }
+    }
+    return 0;
+}
diff --git a/test_scripts/ofccl/static_ofccl_time.cpp b/test_scripts/ofccl/static_ofccl_time.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_time.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+    vector<string> a;
+    vector<string> b;
+    string ss="bandwidth";
+    string str = "N/A";
+    int cnt = 0;
+    while(getline(cin, inputLine)){
+        if (inputLine.find(str,0) == -1)
+            continue;
+
+        stringstream line;
+        line << inputLine;
+        string tmp;
+        stack<string> ss;
+        while(line >> tmp){
+            ss.push(tmp);
+        }
+        ss.pop();
+        ss.pop();
+        ss.pop();
+        a.push_back(ss.top());
+
+        if(++cnt == 25)
+            break;
+    }
+
+    for(auto a1:a)
+        cout<<a1<<endl;
+    return 0;
+}
diff --git a/test_scripts/ofccl/static_ofccl_tm_order.cpp b/test_scripts/ofccl/static_ofccl_tm_order.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_tm_order.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+    int num = *(argv[3]) - '0';
+
+    // One min-heap per buffer size: re-emit times fastest-first.
+    vector<priority_queue<double,vector<double>,greater<double>>> a(25,priority_queue<double,vector<double>,greater<double>>());
+
+    for(int i = 0;i < num;i++){
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            cin>>tmp;
+            a[j].push(tmp);
+        }
+    }
+
+    for(int i = 0;i < num;i++){
+        for(int j = 0;j < 25;j++){
+            double tmp;
+            tmp = a[j].top();a[j].pop();
+            cout<<tmp<<endl;
+        }
+    }
+    return 0;
+}
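The two *_order helpers above take num iterations of 25 values each and re-emit them per buffer size in sorted order: descending for bandwidth (best first), ascending for time. The same transform in Python (a sketch with made-up values):

    def reorder(rows, descending):
        # rows[j] holds one buffer size's values across iterations.
        return [sorted(r, reverse=descending) for r in rows]

    bw_rows = [[7.0, 9.0, 8.0]] * 25          # 25 sizes x 3 iterations
    print(reorder(bw_rows, True)[0])          # [9.0, 8.0, 7.0] -> best first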
diff --git a/test_scripts/ofccl/static_ofccl_totalCnt.cpp b/test_scripts/ofccl/static_ofccl_totalCnt.cpp
new file mode 100644
--- /dev/null
+++ b/test_scripts/ofccl/static_ofccl_totalCnt.cpp
+#include"bits/stdc++.h"
+#include<iostream>
+using namespace std;
+int main(int argc,char* argv[]){
+
+    freopen(argv[1],"r",stdin);
+    freopen(argv[2],"a",stdout);
+
+    string inputLine;
+    vector<vector<int>> save_ori(25,vector<int>());
+    vector<vector<int>> load_ori(25,vector<int>());
+    vector<vector<int>> p7s_ori(25,vector<int>());
+    vector<vector<int>> quit_ori(25,vector<int>());
+
+    vector<double> save_avg;
+    vector<double> load_avg;
+    vector<double> p7s_avg;
+    vector<double> quit_avg;
+
+    string bw="bandwidth";
+
+    int cnt=0;
+    while(getline(cin, inputLine)){
+        if(inputLine.find(bw,0) != -1){
+            // The "bandwidth" line marks the end of one buffer-size block.
+            // save
+            double sum = accumulate(begin(save_ori[cnt]), end(save_ori[cnt]), 0);
+            double mean = sum / save_ori[cnt].size();
+            save_avg.push_back(mean);
+            // load
+            sum = accumulate(begin(load_ori[cnt]), end(load_ori[cnt]), 0);
+            mean = sum / load_ori[cnt].size();
+            load_avg.push_back(mean);
+            // p7s
+            sum = accumulate(begin(p7s_ori[cnt]), end(p7s_ori[cnt]), 0);
+            mean = sum / p7s_ori[cnt].size();
+            p7s_avg.push_back(mean);
+            // quit
+            sum = accumulate(begin(quit_ori[cnt]), end(quit_ori[cnt]), 0);
+            mean = sum / quit_ori[cnt].size();
+            quit_avg.push_back(mean);
+
+            if(++cnt == 25)
+                break;
+        }
+
+        int pos = 0;
+        // save
+        while((pos=inputLine.find("totalCtxSaveCnt=",pos)) != -1){
+            pos += 16;
+            int number = 0;
+            while(inputLine[pos]>='0' && inputLine[pos]<='9'){
+                number = number*10 + (inputLine[pos]-'0');
+                pos++;
+            }
+            save_ori[cnt].push_back(number);
+        }
+        pos = 0;
+        while((pos=inputLine.find("totalCtxLoadCnt=",pos)) != -1){
+            pos += 16;
+            int number = 0;
+            while(inputLine[pos]>='0' && inputLine[pos]<='9'){
+                number = number*10 + (inputLine[pos]-'0');
+                pos++;
+            }
+            load_ori[cnt].push_back(number);
+        }
+
+        pos = 0;
+        while((pos=inputLine.find("totalProgressed7SwithchCnt=",pos)) != -1){
+            pos += 27;
+            int number = 0;
+            while(inputLine[pos]>='0' && inputLine[pos]<='9'){
+                number = number*10 + (inputLine[pos]-'0');
+                pos++;
+            }
+            p7s_ori[cnt].push_back(number);
+        }
+
+        pos = 0;
+        while((pos=inputLine.find("totalUnprogressedQuitCnt=",pos)) != -1){
+            pos += 25;
+            int number = 0;
+            while(inputLine[pos]>='0' && inputLine[pos]<='9'){
+                number = number*10 + (inputLine[pos]-'0');
+                pos++;
+            }
+            quit_ori[cnt].push_back(number);
+        }
+    }
+
+    // Per metric: 25 lines of "avg raw raw raw ...", as the collector expects.
+    for(int i = 0;i < 25;i++){
+        cout << save_avg[i]<<" ";
+        for(auto num:save_ori[i])
+            cout<<num<<" ";
+        cout<<endl;
+    }
+    for(int i = 0;i < 25;i++){
+        cout << load_avg[i]<<" ";
+        for(auto num:load_ori[i])
+            cout<<num<<" ";
+        cout<<endl;
+    }
+    for(int i = 0;i < 25;i++){
+        cout << p7s_avg[i]<<" ";
+        for(auto num:p7s_ori[i])
+            cout<<num<<" ";
+        cout<<endl;
+    }
+    for(int i = 0;i < 25;i++){
+        cout << quit_avg[i]<<" ";
+        for(auto num:quit_ori[i])
+            cout<<num<<" ";
+        cout<<endl;
+    }
+    return 0;
+}
[...]
+    while(cin>>c){
+        if(c == '!')
+            break;
+        flag = true;
+        flag2 = true;
+        for(int i =0;i < a.size();i++){
+            if( c != a[i]){
+                flag = false;
+            }
+            if(i < b.size() && c != b[i]){
+                flag2 = false;
+            }
+            if(flag == false && flag2 == false)
+                break;
+            cin >> c;
+        }
+        if(flag){
+            cnt++;
+            int tmp = 0;
+            while( c >= '0' && c <= '9'){
+                tmp = tmp*10 + c - '0';
+                scanf("%c",&c);
+            }
+            sum += tmp;
+        }
+        if(flag2){
+            cout << (sum * 1.0)/cnt << endl;