From 93a5cda770c028915db8efec274f10ac88570c53 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 8 Jul 2022 07:08:25 +0000 Subject: [PATCH 001/109] add all_reduce_group test --- src/Makefile | 2 +- src/all_reduce_group.cu | 126 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 src/all_reduce_group.cu diff --git a/src/Makefile b/src/Makefile index 2a399db..977aa02 100644 --- a/src/Makefile +++ b/src/Makefile @@ -75,7 +75,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube +BIN_FILES_LIST := all_reduce all_reduce_group all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/all_reduce_group.cu b/src/all_reduce_group.cu new file mode 100644 index 0000000..88e9b3d --- /dev/null +++ b/src/all_reduce_group.cu @@ -0,0 +1,126 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + ncclGroupStart(); + printf("ofccl_nccl_test group start\n"); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("ofccl_nccl_test 1st allreduce\n"); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("ofccl_nccl_test 2nd allreduce\n"); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("ofccl_nccl_test 3rd allreduce\n"); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("ofccl_nccl_test 4th allreduce\n"); + ncclGroupEnd(); + printf("ofccl_nccl_test group end\n"); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Wed, 13 Jul 2022 09:05:00 +0000 Subject: [PATCH 002/109] simple group allreduce --- src/Makefile | 2 +- src_simple/Makefile | 95 ++ {src => src_simple}/all_reduce_group.cu | 81 +- src_simple/common_simple.cu | 1222 +++++++++++++++++++++++ src_simple/common_simple.h | 275 +++++ src_simple/nccl1_compat.h | 50 + 6 files changed, 1687 insertions(+), 38 deletions(-) create mode 100644 src_simple/Makefile rename {src => src_simple}/all_reduce_group.cu (71%) create mode 100644 src_simple/common_simple.cu create mode 100644 src_simple/common_simple.h create mode 100644 src_simple/nccl1_compat.h diff --git a/src/Makefile b/src/Makefile index 977aa02..2a399db 100644 --- a/src/Makefile +++ b/src/Makefile @@ -75,7 +75,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_reduce_group all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather 
sendrecv hypercube BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/Makefile b/src_simple/Makefile new file mode 100644 index 0000000..35ba3bb --- /dev/null +++ b/src_simple/Makefile @@ -0,0 +1,95 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 0 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. +ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_80,code=sm_80 \ + -gencode=arch=compute_80,code=compute_80 +else +NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ + -gencode=arch=compute_50,code=sm_50 \ + -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_70,code=compute_70 +endif + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := all_reduce_group +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_simple.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_simple.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src/all_reduce_group.cu b/src_simple/all_reduce_group.cu similarity index 71% rename from src/all_reduce_group.cu rename to src_simple/all_reduce_group.cu index 88e9b3d..1d484d7 100644 --- a/src/all_reduce_group.cu +++ b/src_simple/all_reduce_group.cu @@ -5,8 +5,9 @@ ************************************************************************/ #include "cuda_runtime.h" -#include "common.h" +#include "common_simple.h" #include +#include void print_header() { PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); @@ -55,18 +56,20 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl } testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, 
ncclComm_t comm, cudaStream_t stream) { + static int round; ncclGroupStart(); - printf("ofccl_nccl_test group start\n"); + printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("ofccl_nccl_test 1st allreduce\n"); + printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("ofccl_nccl_test 2nd allreduce\n"); + printf("<%d> %d ofccl_nccl_test 2nd allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("ofccl_nccl_test 3rd allreduce\n"); + printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("ofccl_nccl_test 4th allreduce\n"); + printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); ncclGroupEnd(); - printf("ofccl_nccl_test group end\n"); + printf("<%d> %d ofccl_nccl_test group end\n", getpid(), round); + round++; return testSuccess; } @@ -85,36 +88,40 @@ void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, in testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { args->collTest = &allReduceTest; - ncclDataType_t *run_types; - ncclRedOp_t *run_ops; - const char **run_typenames, **run_opnames; - int type_count, op_count; - - if ((int)type != -1) { - type_count = 1; - run_types = &type; - run_typenames = &typeName; - } else { - type_count = test_typenum; - run_types = test_types; - run_typenames = test_typenames; - } - - if ((int)op != -1) { - op_count = 1; - run_ops = &op; - run_opnames = &opName; - } else { - op_count = test_opnum; - run_ops = test_ops; - run_opnames = test_opnames; - } - - for (int i=0; i %d ofccl_nccl_test invoke TimeTest\n", getpid(), test_round); + test_round++; + TESTCHECK(TimeTest(args, ncclFloat, "float", ncclSum, "sum", -1)); return testSuccess; } diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu new file mode 100644 index 0000000..d1e5622 --- /dev/null +++ b/src_simple/common_simple.cu @@ -0,0 +1,1222 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_simple.h" +#include +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { + case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template __device__ +float toFloat(T a) { + return (float)a; +} +template<> 
__device__
+float toFloat(half a) {
+  return __half2float(a);
+}
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<> __device__
+float toFloat(__nv_bfloat16 a) {
+  return __bfloat162float(a);
+}
+#endif
+
+template<typename T, int BSIZE> __global__
+void deltaKern(void* A_, void* B_, size_t count, double* max) {
+  const T* A = (const T*)A_;
+  const T* B = (const T*)B_;
+  __shared__ double temp[BSIZE];
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
+  double locmax = 0.0;
+  for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
+    double delta = absDiff(A[i], B[i]);
+    if( delta > locmax ) {
+      locmax = delta;
+#ifdef DEBUG_PRINT
+      if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i]));
+#endif
+    }
+  }
+
+  tid = threadIdx.x;
+  temp[tid] = locmax;
+  for(int stride = BSIZE/2; stride > 1; stride>>=1) {
+    __syncthreads();
+    if( tid < stride )
+      temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride];
+  }
+  __syncthreads();
+  if( threadIdx.x == 0)
+    max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
+}
+
+testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
+  switch (type) {
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16:
+      deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+#endif
+    case ncclHalf:
+      deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+    case ncclFloat:
+      deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+    case ncclDouble:
+      deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+
+    case ncclChar:
+#if NCCL_MAJOR >= 2
+    case ncclUint8:
+#endif
+      deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+    case ncclInt:
+#if NCCL_MAJOR >= 2
+    case ncclUint32:
+#endif
+      deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+    case ncclInt64:
+    case ncclUint64:
+      deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
+  }
+  CUDACHECK(cudaDeviceSynchronize());
+  for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
+  return testSuccess;
+}
+
+template<typename T>
+__device__ T testValue(const size_t offset, const int rep, const int rank) {
+  uint8_t v = (rep+rank+offset) % 256;
+  return (T)v;
+}
+
+// For floating point datatype, we use values between 0 and 1 otherwise the
+// Product operation will produce NaNs.
+template<>
+__device__ double testValue<double>(const size_t offset, const int rep, const int rank) {
+  return 1.0/(1.0+(double)testValue<int>(offset, rep, rank));
+}
+template<>
+__device__ float testValue<float>(const size_t offset, const int rep, const int rank) {
+  return 1.0/(1.0+(float)testValue<int>(offset, rep, rank));
+}
+template<>
+__device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
+  return __float2half(testValue<float>(offset, rep, rank));
+}
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) {
+  return __float2bfloat16(testValue<float>(offset, rep, rank));
+}
+#endif
+
+// Operations
+template<typename T>
+__device__ T ncclOpSum(T a, T b) { return a+b; }
+template<typename T>
+__device__ T ncclOpProd(T a, T b) { return a*b; }
+template<typename T>
+__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; }
+template<typename T>
+__device__ T ncclOpMin(T a, T b) { return a<b ? a : b; }
+template<>
+__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); }
+template<>
+__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); }
+template<>
+__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; }
+template<>
+__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
+
+template<typename T>
+__device__ T ncclPPOpIdent(T x, int arg) { return x; }
+template<typename T>
+__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); }
+template<typename T>
+__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); }
+template<>
+__device__ half ncclPPOpMul(half x, int arg) {
+  return __float2half(__half2float(x)*float(arg));
+}
+template<>
+__device__ half ncclPPOpDiv(half x, int n) {
+  return __float2half(__half2float(x)/n);
+}
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) {
+  return __float2bfloat16(__bfloat162float(x)*float(arg));
+}
+template<>
+__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) {
+  return __float2bfloat16(__bfloat162float(x)/n);
+}
+#endif
+
+__host__ __device__ int preMulScalar(int rank) {
+  return 1 + rank%2;
+}
+
+template<typename T, T (*Op)(T, T), T (*PreOp)(T, int), T (*PostOp)(T, int)>
+__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
+  for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=blockDim.x*gridDim.x) {
+    T val = testValue<T>(o+offset, rep, 0);
+    val = PreOp(val, preMulScalar(0));
+    for (int i=1; i<nranks; i++) {
+      T val1 = testValue<T>(o+offset, rep, i);
+      val1 = PreOp(val1, preMulScalar(i));
+      val = Op(val, val1);
+    }
+    data[o] = PostOp(val, nranks);
+  }
+}
+
+#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel<type, op<type>, preop<type>, postop<type> >
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \
+    KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent)
+#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv)
+#else
+  #define OPS(type) \
+    KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
+    KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent)
+#endif
+
+static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = {
+  OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
+#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+  OPS(__nv_bfloat16)
+#endif
+};
+
+testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
+  dim3 grid = { 32, 1, 1 };
+  dim3 block = { 256, 1, 1 };
+  void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks };
+  CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault));
+  return testSuccess;
+}
+
+template<typename T>
+__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) {
+  for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=blockDim.x*gridDim.x)
+    data[o] = testValue<T>(o, rep, rank);
+}
+
+static void* const initDataKerns[ncclNumTypes] = {
+  (void*)InitDataKernel<  int8_t>,
+  (void*)InitDataKernel< uint8_t>,
+  (void*)InitDataKernel< int32_t>,
+  (void*)InitDataKernel<uint32_t>,
+  (void*)InitDataKernel< int64_t>,
+  (void*)InitDataKernel<uint64_t>,
+  (void*)InitDataKernel<    half>,
+  
(void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // printf("startColl, args->nGpus > 1 run ncclGroupStart\n"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) { + // printf("startColl, args->nGpus > 1 run ncclGroupEnd\n"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } + +// // Sync +// TESTCHECK(startColl(args, type, op, root, in_place, 0)); +// TESTCHECK(completeColl(args)); + +// Barrier(args); + +// #if CUDART_VERSION >= 11030 +// cudaGraph_t graphs[args->nGpus]; +// cudaGraphExec_t graphExec[args->nGpus]; +// if (cudaGraphLaunches >= 1) { +// // Begin cuda graph capture +// for (int i=0; inGpus; i++) { +// // Thread local mode is needed for: +// // - Multi-thread mode +// // - P2P pre-connect +// CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); +// } +// } +// #endif + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if 
(agg_iters>1) NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // End cuda graph capture +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); +// } +// // Instantiate cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); +// } +// // Resync CPU, restart timing, launch cuda graph +// Barrier(args); +// start = std::chrono::high_resolution_clock::now(); +// for (int l=0; lnGpus; i++) { +// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); +// } +// } +// } +// #endif + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// //destroy cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); +// CUDACHECK(cudaGraphDestroy(graphs[i])); +// } +// } +// #endif + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // Begin cuda graph capture for data check +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); +// } +// } +// #endif + + //test validation in single itertion, should ideally be included into the multi-iteration run + // TESTCHECK(startColl(args, type, op, root, in_place, 0)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // End cuda graph capture +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); +// } +// // Instantiate cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); +// } +// // Launch cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); +// } +// } +// #endif + + // TESTCHECK(completeColl(args)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// //destroy cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); +// CUDACHECK(cudaGraphDestroy(graphs[i])); +// } +// } +// #endif + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // // Warm-up for large size + // setupArgs(args->maxbytes, type, args); + // for (int iter = 0; iter < warmup_iters; iter++) { + // TESTCHECK(startColl(args, type, op, root, 0, iter)); + // } + // TESTCHECK(completeColl(args)); + + // // Warm-up for small size + // setupArgs(args->minbytes, type, args); + // for (int iter = 0; iter < warmup_iters; iter++) { + // TESTCHECK(startColl(args, type, op, root, 0, iter)); + // } + // TESTCHECK(completeColl(args)); + + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. 
If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, 
+ {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if 
(parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + void* expected[nGpus*nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + threads[t].args.sendbuffs = sendbuffs+t*nGpus; + threads[t].args.recvbuffs = recvbuffs+t*nGpus; + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef 
testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif From 06c97daf8c8b4c3d1cb4d9952eadf9200ce0ed4e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 14 Jul 2022 12:10:11 +0000 Subject: [PATCH 003/109] nccl group bigger --- src_simple/all_reduce_group.cu | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src_simple/all_reduce_group.cu b/src_simple/all_reduce_group.cu index 1d484d7..9a702ec 100644 --- a/src_simple/all_reduce_group.cu +++ b/src_simple/all_reduce_group.cu @@ -59,6 +59,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, nccl static int round; ncclGroupStart(); printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); @@ -67,6 +68,15 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, nccl printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 5th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 6th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 7th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 8th allreduce\n", getpid(), round); + ncclGroupEnd(); printf("<%d> %d ofccl_nccl_test group end\n", getpid(), 
round); round++; From 560f9eb298164312b7c59036c8029dc1a85fb912 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 16 Jul 2022 04:18:43 +0000 Subject: [PATCH 004/109] log for ncclGroupStart/End --- src/nccl1_compat.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h index 020a4bc..32f04e6 100644 --- a/src/nccl1_compat.h +++ b/src/nccl1_compat.h @@ -3,7 +3,7 @@ * * See LICENSE.txt for license information ************************************************************************/ - +#include #ifndef NCCL1_COMPAT_H #define NCCL1_COMPAT_H @@ -14,8 +14,8 @@ #define ncclNumOps nccl_NUM_OPS #define ncclNumTypes nccl_NUM_TYPES -static ncclResult_t ncclGroupStart() { return ncclSuccess; } -static ncclResult_t ncclGroupEnd() { return ncclSuccess; } +static ncclResult_t ncclGroupStart() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } +static ncclResult_t ncclGroupEnd() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; From abe8f5c27ae9865cef91e305e0c5107bc158e757 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 16 Jul 2022 04:34:50 +0000 Subject: [PATCH 005/109] add non group all_reduce in simple --- src_simple/Makefile | 2 +- src_simple/all_reduce_simple.cu | 114 ++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 src_simple/all_reduce_simple.cu diff --git a/src_simple/Makefile b/src_simple/Makefile index 35ba3bb..86267b2 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -75,7 +75,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce_group +BIN_FILES_LIST := all_reduce_group all_reduce_simple BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/all_reduce_simple.cu b/src_simple/all_reduce_simple.cu new file mode 100644 index 0000000..bdeeb48 --- /dev/null +++ b/src_simple/all_reduce_simple.cu @@ -0,0 +1,114 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_simple.h" + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = 
test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Sat, 16 Jul 2022 04:39:37 +0000 Subject: [PATCH 006/109] half ofccl_all_reduce --- src_simple/Makefile | 2 +- src_simple/ofccl_all_reduce.cu | 143 +++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 src_simple/ofccl_all_reduce.cu diff --git a/src_simple/Makefile b/src_simple/Makefile index 35ba3bb..5e56588 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -75,7 +75,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce_group +BIN_FILES_LIST := all_reduce_group ofccl_all_reduce BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu new file mode 100644 index 0000000..9a702ec --- /dev/null +++ b/src_simple/ofccl_all_reduce.cu @@ -0,0 +1,143 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + static int round; + ncclGroupStart(); + printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); + + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 2nd allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 5th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 6th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 7th allreduce\n", getpid(), round); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> %d ofccl_nccl_test 8th allreduce\n", getpid(), round); + + ncclGroupEnd(); + printf("<%d> %d ofccl_nccl_test group end\n", getpid(), round); + round++; + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + // ncclDataType_t *run_types; + // ncclRedOp_t *run_ops; + // const char **run_typenames, **run_opnames; + // int type_count, op_count; + + // if ((int)type != -1) { + // type_count = 1; + // run_types = &type; + // run_typenames = &typeName; + // } else { + // type_count = test_typenum; + // run_types = test_types; + // run_typenames = test_typenames; + // } + + // if ((int)op != -1) { + // op_count = 1; + // run_ops = &op; + // run_opnames = &opName; + // } else { + // op_count = test_opnum; + // run_ops = test_ops; + // run_opnames = test_opnames; + // } + + // for (int i=0; i %d ofccl_nccl_test invoke TimeTest\n", getpid(), test_round); + test_round++; + TESTCHECK(TimeTest(args, ncclFloat, "float", ncclSum, "sum", -1)); + return testSuccess; +} + +struct testEngine allReduceEngine = { + AllReduceGetBuffSize, + 
AllReduceRunTest +}; + +#pragma weak ncclTestEngine=allReduceEngine From b8a749a96776893c5c503780233163ac2058abd3 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 16 Jul 2022 09:15:44 +0000 Subject: [PATCH 007/109] Simple not really necessary, yet no harm to keep it --- src_simple/common_simple.cu | 137 +--- src_simple/common_simple.cu.pure | 1216 +++++++++++++++++++++++++++ src_simple/common_simple.cu.simple | 1222 ++++++++++++++++++++++++++++ 3 files changed, 2460 insertions(+), 115 deletions(-) create mode 100644 src_simple/common_simple.cu.pure create mode 100644 src_simple/common_simple.cu.simple diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index d1e5622..0d88bb3 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -518,7 +518,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t size_t shift = totalnbytes * (iter % steps); if (args->nGpus > 1) { - // printf("startColl, args->nGpus > 1 run ncclGroupStart\n"); + printf("\nstartColl, args->nGpus > 1 run ncclGroupStart\n"); NCCLCHECK(ncclGroupStart()); } for (int i = 0; i < args->nGpus; i++) { @@ -575,7 +575,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #endif } if (args->nGpus > 1) { - // printf("startColl, args->nGpus > 1 run ncclGroupEnd\n"); + printf("\nstartColl, args->nGpus > 1 run ncclGroupEnd\n"); NCCLCHECK(ncclGroupEnd()); } @@ -601,25 +601,25 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); } -// // Sync -// TESTCHECK(startColl(args, type, op, root, in_place, 0)); -// TESTCHECK(completeColl(args)); - -// Barrier(args); - -// #if CUDART_VERSION >= 11030 -// cudaGraph_t graphs[args->nGpus]; -// cudaGraphExec_t graphExec[args->nGpus]; -// if (cudaGraphLaunches >= 1) { -// // Begin cuda graph capture -// for (int i=0; inGpus; i++) { -// // Thread local mode is needed for: -// // - Multi-thread mode -// // - P2P pre-connect -// CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); -// } -// } -// #endif + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + Barrier(args); + +#if CUDART_VERSION >= 11030 + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + // Thread local mode is needed for: + // - Multi-thread mode + // - P2P pre-connect + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } +#endif // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); @@ -631,27 +631,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // End cuda graph capture -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); -// } -// // Instantiate cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); -// } -// // Resync CPU, restart timing, launch cuda graph -// Barrier(args); -// start = std::chrono::high_resolution_clock::now(); -// for (int l=0; lnGpus; i++) { -// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); -// } -// } -// } -// #endif - TESTCHECK(completeColl(args)); auto delta = 
std::chrono::high_resolution_clock::now() - start; @@ -660,15 +639,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; Allreduce(args, &deltaSec, average); -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// //destroy cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); -// CUDACHECK(cudaGraphDestroy(graphs[i])); -// } -// } -// #endif double algBw, busBw; args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); @@ -678,56 +648,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t double maxDelta = 0; static __thread int rep = 0; rep++; - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // Begin cuda graph capture for data check -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); -// } -// } -// #endif - - //test validation in single itertion, should ideally be included into the multi-iteration run - // TESTCHECK(startColl(args, type, op, root, in_place, 0)); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // End cuda graph capture -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); -// } -// // Instantiate cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); -// } -// // Launch cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); -// } -// } -// #endif - - // TESTCHECK(completeColl(args)); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// //destroy cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); -// CUDACHECK(cudaGraphDestroy(graphs[i])); -// } -// } -// #endif - - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); - - //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); - } + double timeUsec = deltaSec*1.0E6; char timeStr[100]; @@ -764,26 +685,12 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { } testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { - // // Warm-up for large size - // setupArgs(args->maxbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // TESTCHECK(startColl(args, type, op, root, 0, iter)); - // } - // TESTCHECK(completeColl(args)); - - // // Warm-up for small size - // setupArgs(args->minbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // TESTCHECK(startColl(args, type, op, root, 0, iter)); - // } - // TESTCHECK(completeColl(args)); // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? 
size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); TESTCHECK(BenchTime(args, type, op, root, 0)); - // TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } return testSuccess; diff --git a/src_simple/common_simple.cu.pure b/src_simple/common_simple.cu.pure new file mode 100644 index 0000000..c25c0e3 --- /dev/null +++ b/src_simple/common_simple.cu.pure @@ -0,0 +1,1216 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_simple.h" +#include +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { 
+ case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template __device__ +float toFloat(T a) { + return (float)a; +} +template<> __device__ +float toFloat(half a) { + return __half2float(a); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> __device__ +float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template __global__ +void deltaKern(void* A_, void* B_, size_t count, double* max) { + const T* A = (const T*)A_; + const T* B = (const T*)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x*blockDim.x + threadIdx.x; + double locmax = 0.0; + for(size_t i=tid; i locmax ) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for(int stride = BSIZE/2; stride > 1; stride>>=1) { + __syncthreads(); + if( tid < stride ) + temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + } + __syncthreads(); + if( threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep+rank+offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. 
+template<> +__device__ double testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(double)testValue(offset, rep, rank)); +} +template<> +__device__ float testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(float)testValue(offset, rep, rank)); +} +template<> +__device__ half testValue(const size_t offset, const int rep, const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template +__device__ T ncclOpSum(T a, T b) { return a+b; } +template +__device__ T ncclOpProd(T a, T b) { return a*b; } +template +__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } +template +__device__ T ncclOpMin(T a, T b) { return a +__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } +template<> +__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } +template<> +__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } +template<> +__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; } + +template +__device__ T ncclPPOpIdent(T x, int arg) { return x; } +template +__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } +template +__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } +template<> +__device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x)*float(arg)); +} +template<> +__device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x)/n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x)*float(arg)); +} +template<> +__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x)/n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { + return 1 + rank%2; +} + +template +__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i=1; i(o+offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, 
ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); +} + +static void* const initDataKerns[ncclNumTypes] = { + (void*)InitDataKernel< int8_t>, + (void*)InitDataKernel< uint8_t>, + (void*)InitDataKernel< int32_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< int64_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< half>, + (void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? 
MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } + + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + Barrier(args); + +#if CUDART_VERSION >= 11030 + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + // Thread local mode is needed for: + // - Multi-thread mode + // - P2P pre-connect + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } +#endif + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + } + if 
(agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Resync CPU, restart timing, launch cuda graph + Barrier(args); + start = std::chrono::high_resolution_clock::now(); + for (int l=0; lnGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } + } +#endif + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture for data check + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); + } + } +#endif + + //test validation in single itertion, should ideally be included into the multi-iteration run + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Launch cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } +#endif + + TESTCHECK(completeColl(args)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; 
+ + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // Warm-up for large size + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Warm-up for small size + setupArgs(args->minbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 
'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + 
void* expected[nGpus*nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + threads[t].args.sendbuffs = sendbuffs+t*nGpus; + threads[t].args.recvbuffs = recvbuffs+t*nGpus; + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t 
stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { + case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template __device__ +float toFloat(T a) { + return (float)a; +} +template<> __device__ +float toFloat(half a) { + return __half2float(a); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> __device__ +float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template __global__ +void deltaKern(void* A_, void* B_, size_t count, double* max) { + const T* A = (const T*)A_; + const T* B = (const T*)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x*blockDim.x + threadIdx.x; + double locmax = 0.0; + for(size_t i=tid; i locmax ) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for(int stride = BSIZE/2; stride > 1; stride>>=1) { + __syncthreads(); + if( tid < stride ) + temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + } + __syncthreads(); + if( threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? 
temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep+rank+offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template<> +__device__ double testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(double)testValue(offset, rep, rank)); +} +template<> +__device__ float testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(float)testValue(offset, rep, rank)); +} +template<> +__device__ half testValue(const size_t offset, const int rep, const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template +__device__ T ncclOpSum(T a, T b) { return a+b; } +template +__device__ T ncclOpProd(T a, T b) { return a*b; } +template +__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } +template +__device__ T ncclOpMin(T a, T b) { return a +__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } +template<> +__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } +template<> +__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } +template<> +__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } + +template +__device__ T ncclPPOpIdent(T x, int arg) { return x; } +template +__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } +template +__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } +template<> +__device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x)*float(arg)); +} +template<> +__device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x)/n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x)*float(arg)); +} +template<> +__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x)/n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { + return 1 + rank%2; +} + +template +__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i=1; i(o+offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); +} + +static void* const initDataKerns[ncclNumTypes] = { + (void*)InitDataKernel< int8_t>, + (void*)InitDataKernel< uint8_t>, + (void*)InitDataKernel< int32_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< int64_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< half>, + 
(void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // printf("startColl, args->nGpus > 1 run ncclGroupStart\n"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) { + // printf("startColl, args->nGpus > 1 run ncclGroupEnd\n"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } + + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + Barrier(args); + +#if CUDART_VERSION >= 11030 + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + // Thread local mode is needed for: + // - Multi-thread mode + // - P2P pre-connect + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } +#endif + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); 
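// Annotation (not part of the original patch): when agg_iters > 1, the aiter
// launches below are wrapped in a single ncclGroupStart()/ncclGroupEnd() pair,
// so each timed iteration submits agg_iters collectives as one aggregated
// group; the elapsed time is later divided by iters*agg_iters to report a
// per-operation figure.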
+ for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // End cuda graph capture +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); +// } +// // Instantiate cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); +// } +// // Resync CPU, restart timing, launch cuda graph +// Barrier(args); +// start = std::chrono::high_resolution_clock::now(); +// for (int l=0; lnGpus; i++) { +// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); +// } +// } +// } +// #endif + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// //destroy cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); +// CUDACHECK(cudaGraphDestroy(graphs[i])); +// } +// } +// #endif + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // Begin cuda graph capture for data check +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); +// } +// } +// #endif + + //test validation in single itertion, should ideally be included into the multi-iteration run + // TESTCHECK(startColl(args, type, op, root, in_place, 0)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// // End cuda graph capture +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); +// } +// // Instantiate cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); +// } +// // Launch cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); +// } +// } +// #endif + + // TESTCHECK(completeColl(args)); + +// #if CUDART_VERSION >= 11030 +// if (cudaGraphLaunches >= 1) { +// //destroy cuda graph +// for (int i=0; inGpus; i++) { +// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); +// CUDACHECK(cudaGraphDestroy(graphs[i])); +// } +// } +// #endif + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // // Warm-up for large size + // setupArgs(args->maxbytes, type, args); + // for (int iter = 0; iter < warmup_iters; iter++) { + // TESTCHECK(startColl(args, type, op, root, 0, iter)); + // } + // TESTCHECK(completeColl(args)); + + // // Warm-up for small size + // setupArgs(args->minbytes, type, args); + // for (int iter = 0; iter < warmup_iters; iter++) { + // TESTCHECK(startColl(args, type, op, root, 0, iter)); + // } + // TESTCHECK(completeColl(args)); + + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. 
If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, 
+ {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if 
(parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + void* expected[nGpus*nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + threads[t].args.sendbuffs = sendbuffs+t*nGpus; + threads[t].args.recvbuffs = recvbuffs+t*nGpus; + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i Date: Mon, 18 Jul 2022 02:45:09 +0000 Subject: [PATCH 008/109] ofccl test file --- .gitignore | 2 ++ src_simple/ofccl_all_reduce.cu | 27 +++------------------------ 2 files changed, 5 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index a0a013e..c908b05 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ # # See LICENCE.txt for license information /build + +.clangd \ No newline at end of file diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 9a702ec..9d3ad0c 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -56,30 +56,9 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl } testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - static int round; - ncclGroupStart(); - printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); - - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 2nd allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 5th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 6th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 7th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 8th allreduce\n", getpid(), round); - - ncclGroupEnd(); - printf("<%d> %d ofccl_nccl_test group end\n", getpid(), round); - round++; + + NCCLCHECK(ofcclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + printf("<%d> ofccl_nccl_test invoke ofcclAllReduce\n", getpid()); return testSuccess; } From 8eba16feff90fa7aa6a2e97965539c738797722f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 18 Jul 2022 08:54:42 +0000 Subject: [PATCH 009/109] run startColl exactly as we want --- src_simple/common_simple.cu | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 0d88bb3..ba44d36 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -601,26 +601,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t 
TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); } - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); - Barrier(args); -#if CUDART_VERSION >= 11030 - cudaGraph_t graphs[args->nGpus]; - cudaGraphExec_t graphExec[args->nGpus]; - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture - for (int i=0; inGpus; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect - CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); - } - } -#endif - // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { From d6a4d47eddb59d905c4e68928dc0cd2d570ea305 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 18 Jul 2022 10:54:47 +0000 Subject: [PATCH 010/109] ofccl_all_reduce.cu --- src_simple/ofccl_all_reduce.cu | 64 ++++++++++++++++------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 9d3ad0c..62f8b69 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -77,40 +77,36 @@ void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, in testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { args->collTest = &allReduceTest; - // ncclDataType_t *run_types; - // ncclRedOp_t *run_ops; - // const char **run_typenames, **run_opnames; - // int type_count, op_count; - - // if ((int)type != -1) { - // type_count = 1; - // run_types = &type; - // run_typenames = &typeName; - // } else { - // type_count = test_typenum; - // run_types = test_types; - // run_typenames = test_typenames; - // } - - // if ((int)op != -1) { - // op_count = 1; - // run_ops = &op; - // run_opnames = &opName; - // } else { - // op_count = test_opnum; - // run_ops = test_ops; - // run_opnames = test_opnames; - // } - - // for (int i=0; i %d ofccl_nccl_test invoke TimeTest\n", getpid(), test_round); - test_round++; - TESTCHECK(TimeTest(args, ncclFloat, "float", ncclSum, "sum", -1)); + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Tue, 19 Jul 2022 15:25:50 +0000 Subject: [PATCH 011/109] add log --- src_simple/common_simple.cu | 5 +++++ src_simple/common_simple.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index ba44d36..e8fc1a6 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -679,6 +679,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* } testResult_t threadRunTests(struct threadArgs* args) { + // OFTEST_LOG1(TEST, "Enter threadRunTests"); // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. 
@@ -689,6 +690,7 @@ testResult_t threadRunTests(struct threadArgs* args) { } testResult_t threadInit(struct threadArgs* args) { + // OFTEST_LOG1(TEST, "Enter threadInit"); char hostname[1024]; getHostName(hostname, 1024); int nranks = args->nProcs*args->nThreads*args->nGpus; @@ -701,6 +703,7 @@ testResult_t threadInit(struct threadArgs* args) { int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); @@ -992,11 +995,13 @@ testResult_t run() { if (nProcs == 1) { int gpuArray[nGpus*nThreads]; for (int i=0; i #include "nccl1_compat.h" +#define OFTEST_LOG(PRE, FMT, args...) printf("\nTEST [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("\nTEST [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("\nTEST [%s:%d] <%s> " #PRE, __FILE__, __LINE__, __func__) + #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ From d3de0211576a5e9002908f25ce67f6abdcf0424f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 19 Jul 2022 18:07:50 +0000 Subject: [PATCH 012/109] add -M option: use seprate ncclComm for different coll op, even with the same dev set --- README.md | 1 + src_simple/common_simple.cu | 1459 ++++++++++++++++------------ src_simple/common_simple.cu.simple | 40 +- 3 files changed, 853 insertions(+), 647 deletions(-) diff --git a/README.md b/README.md index bff6433..1c3c505 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ All tests support the same set of arguments : * `-n,--iters ` number of iterations. Default : 20. * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. + * `-M,--multi_iters ` number of operations with seprate ncclComm in each iteration. Default : 1. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. 
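For reference, a minimal sketch of what the new `-M,--multi_iters` option is intended to enable: each timed iteration issues multi_iters collectives, and each of them runs on its own ncclComm_t built over the same set of devices. The communicator layout used below (multi_iters * nGpus comms, indexed as miter*nGpus + g) and the helper name runMultiIters are illustrative assumptions, not code taken from this patch.

    #include <nccl.h>

    // Sketch only: drive multi_iters collectives per iteration, each on a
    // separate communicator spanning the same GPUs. Error checking (NCCLCHECK)
    // is omitted for brevity; comms is assumed to hold multi_iters * nGpus
    // communicators created over the same device set.
    static void runMultiIters(int iters, int multi_iters, int nGpus, size_t count,
                              void** sendbuffs, void** recvbuffs,
                              ncclComm_t* comms, cudaStream_t* streams) {
      for (int iter = 0; iter < iters; iter++) {
        for (int miter = 0; miter < multi_iters; miter++) {
          ncclGroupStart();
          for (int g = 0; g < nGpus; g++) {
            ncclAllReduce(sendbuffs[g], recvbuffs[g], count, ncclFloat, ncclSum,
                          comms[miter * nGpus + g], streams[g]);
          }
          ncclGroupEnd();
        }
      }
    }

An invocation would then look something like `./ofccl_all_reduce_perf -g 4 -M 4` (binary name assumed), giving four independent communicators over the same devices in every iteration.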
diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index e8fc1a6..bb64ebc 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -5,46 +5,74 @@ ************************************************************************/ #include "common_simple.h" -#include +#include "cuda.h" +#include "nccl.h" #include +#include #include #include -#include "cuda.h" +#include int test_ncclVersion = 0; // init'd with ncclGetVersion() #if NCCL_MAJOR >= 2 - ncclDataType_t test_types[ncclNumTypes] = { - ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclBfloat16 - #endif - }; - const char *test_typenames[ncclNumTypes] = { - "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , "bfloat16" - #endif - }; - int test_typenum = -1; - - const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclAvg - #endif - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand - #endif - }; - int test_opnum = -1; +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; #else - ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; - const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; - int test_typenum = 7; - const char *test_opnames[] = {"sum", "prod", "max", "min"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; - int test_opnum = 4; +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; #endif thread_local int is_main_thread = 0; @@ -52,14 +80,15 @@ thread_local int is_main_thread = 0; // Command line parameter defaults static int nThreads = 1; static int nGpus = 1; -static size_t minBytes = 32*1024*1024; -static size_t maxBytes = 32*1024*1024; -static size_t stepBytes = 1*1024*1024; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static 
size_t stepBytes = 1 * 1024 * 1024; static size_t stepFactor = 1; static int datacheck = 1; static int warmup_iters = 5; static int iters = 20; static int agg_iters = 1; +static int multi_iters = 1; static int ncclop = ncclSum; static int nccltype = ncclFloat; static int ncclroot = 0; @@ -72,234 +101,251 @@ static int average = 1; #define NUM_BLOCKS 32 static double parsesize(const char *value) { - long long int units; - double size; - char size_lit; - - int count = sscanf(value, "%lf %1s", &size, &size_lit); - - switch (count) { - case 2: - switch (size_lit) { - case 'G': - case 'g': - units = 1024*1024*1024; - break; - case 'M': - case 'm': - units = 1024*1024; - break; - case 'K': - case 'k': - units = 1024; - break; - default: - return -1.0; - }; + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; break; - case 1: - units = 1; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; break; default: return -1.0; - } + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } - return size * units; + return size * units; } double DeltaMaxValue(ncclDataType_t type) { - switch(type) { - case ncclHalf: return 1e-2; + switch (type) { + case ncclHalf: + return 1e-2; #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: return 1e-2; + case ncclBfloat16: + return 1e-2; #endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - case ncclInt: + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: #if NCCL_MAJOR >= 2 - case ncclUint8: - //case ncclInt32: - case ncclUint32: + case ncclUint8: + // case ncclInt32: + case ncclUint32: #endif - case ncclInt64: - case ncclUint64: return 1e-200; + case ncclInt64: + case ncclUint64: + return 1e-200; } return 1e-200; } -template __device__ -double absDiff(T a, T b) { +template __device__ double absDiff(T a, T b) { return fabs((double)(b - a)); } -template<> __device__ -double absDiff(half a, half b) { +template <> __device__ double absDiff(half a, half b) { float x = __half2float(a); float y = __half2float(b); - return fabs((double)(y-x)); + return fabs((double)(y - x)); } -template __device__ -float toFloat(T a) { - return (float)a; -} -template<> __device__ -float toFloat(half a) { - return __half2float(a); -} +template __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } #if defined(__CUDA_BF16_TYPES_EXIST__) -template<> __device__ -float toFloat(__nv_bfloat16 a) { +template <> __device__ float toFloat(__nv_bfloat16 a) { return __bfloat162float(a); } #endif -template __global__ -void deltaKern(void* A_, void* B_, size_t count, double* max) { - const T* A = (const T*)A_; - const T* B = (const T*)B_; +template +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; __shared__ double temp[BSIZE]; - int tid = blockIdx.x*blockDim.x + threadIdx.x; + int tid = blockIdx.x * blockDim.x + threadIdx.x; double locmax = 0.0; - for(size_t i=tid; i locmax ) { + if (delta > locmax) { locmax = delta; #ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); #endif } } tid = 
threadIdx.x; temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; } __syncthreads(); - if( threadIdx.x == 0) + if (threadIdx.x == 0) max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; } -testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { +testResult_t CheckDelta(void *results, void *expected, size_t count, + ncclDataType_t type, double *devmax) { switch (type) { #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512> + <<>>(results, expected, count, devmax); + break; #endif - case ncclHalf: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclFloat: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclDouble: - deltaKern<<>>(results, expected, count, devmax); break; - - case ncclChar: + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); + break; + case ncclFloat: + deltaKern + <<>>(results, expected, count, devmax); + break; + case ncclDouble: + deltaKern + <<>>(results, expected, count, devmax); + break; + + case ncclChar: #if NCCL_MAJOR >= 2 - case ncclUint8: + case ncclUint8: #endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt: + deltaKern + <<>>(results, expected, count, devmax); + break; + case ncclInt: #if NCCL_MAJOR >= 2 - case ncclUint32: + case ncclUint32: #endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt64: - case ncclUint64: - deltaKern<<>>(results, expected, count, devmax); break; + deltaKern + <<>>(results, expected, count, devmax); + break; + case ncclInt64: + case ncclUint64: + deltaKern + <<>>(results, expected, count, devmax); + break; } CUDACHECK(cudaDeviceSynchronize()); - for (int i=1; i +template __device__ T testValue(const size_t offset, const int rep, const int rank) { - uint8_t v = (rep+rank+offset) % 256; + uint8_t v = (rep + rank + offset) % 256; return (T)v; } // For floating point datatype, we use values between 0 and 1 otherwise the // Product operation will produce NaNs. 
-template<> -__device__ double testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(double)testValue(offset, rep, rank)); +template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue(offset, rep, rank)); } -template<> -__device__ float testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(float)testValue(offset, rep, rank)); +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); } -template<> -__device__ half testValue(const size_t offset, const int rep, const int rank) { +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { return __float2half(testValue(offset, rep, rank)); } #if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { return __float2bfloat16(testValue(offset, rep, rank)); } #endif // Operations -template -__device__ T ncclOpSum(T a, T b) { return a+b; } -template -__device__ T ncclOpProd(T a, T b) { return a*b; } -template -__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } -template -__device__ T ncclOpMin(T a, T b) { return a __device__ T ncclOpSum(T a, T b) { return a + b; } +template __device__ T ncclOpProd(T a, T b) { return a * b; } +template __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template __device__ T ncclOpMin(T a, T b) { return a < b ? a : b; } // Definitions for half -template<> -__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } -template<> -__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } -template<> -__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } -template<> -__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; } - -template -__device__ T ncclPPOpIdent(T x, int arg) { return x; } -template -__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } -template -__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } -template<> -__device__ half ncclPPOpMul(half x, int arg) { - return __float2half(__half2float(x)*float(arg)); +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? 
a : b; +} + +template __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); } -template<> -__device__ half ncclPPOpDiv(half x, int n) { - return __float2half(__half2float(x)/n); +template __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); } #if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { - return __float2bfloat16(__bfloat162float(x)*float(arg)); +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); } -template<> -__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { - return __float2bfloat16(__bfloat162float(x)/n); +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); } #endif -__host__ __device__ int preMulScalar(int rank) { - return 1 + rank%2; -} +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } -template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); +template +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue(o + offset, rep, 0); val = PreOp(val, preMulScalar(0)); - for (int i=1; i(o+offset, rep, i); + for (int i = 1; i < nranks; i++) { + T val1 = testValue(o + offset, rep, i); val1 = PreOp(val1, preMulScalar(i)); val = Op(val, val1); } @@ -307,212 +353,243 @@ __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offse } } -#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ - KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel, preop, postop> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, 
ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) #else - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) #endif -static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - OPS(__nv_bfloat16) +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) #endif }; -testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count, (void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); return testSuccess; } -template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) data[o] = testValue(o, rep, rank); } -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - (void*)InitDataKernel< float>, - (void*)InitDataKernel< double>, -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - (void*)InitDataKernel<__nv_bfloat16> +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> #endif }; -template -testResult_t 
InitDataType(void* dest, const size_t N, const int rep, const int rank) { - T* ptr = (T*)dest; +template +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); return testSuccess; } -testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, + cudaStreamDefault)); return testSuccess; } -void Barrier(struct threadArgs* args) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { + if (args->thread + 1 == args->nThreads) { #ifdef MPI_SUPPORT MPI_Barrier(MPI_COMM_WORLD); #endif args->barrier[args->barrier_idx] = 0; } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); + while (args->barrier[args->barrier_idx]) + pthread_yield(); } - args->barrier_idx=!args->barrier_idx; + args->barrier_idx = !args->barrier_idx; } // Inter-thread/process barrier+allreduce -void Allreduce(struct threadArgs* args, double* value, int average) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); double val = *value; if (args->thread > 0) { double val2 = args->reduce[args->barrier_idx]; - if (average == 1) val += val2; - if (average == 2) val = std::min(val, val2); - if (average == 3) val = std::max(val, val2); + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); } - if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { + if (args->thread + 1 == args->nThreads) { #ifdef MPI_SUPPORT if (average != 0) { MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? 
MPI_MIN : MPI_MAX; - MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); } #endif - if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; - args->reduce[1-args->barrier_idx] = 0; + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; args->barrier[args->barrier_idx] = 0; } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); + while (args->barrier[args->barrier_idx]) + pthread_yield(); } *value = args->reduce[args->barrier_idx]; - args->barrier_idx=!args->barrier_idx; + args->barrier_idx = !args->barrier_idx; } -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { - size_t count = args->expectedBytes/wordSize(type); +testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes / wordSize(type); double maxDelta = 0.0; - for (int i=0; inGpus; i++) { + for (int i = 0; i < args->nGpus; i++) { int device; - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); CUDACHECK(cudaSetDevice(device)); - void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + + args->recvInplaceOffset * rank)) + : args->recvbuffs[i]; + TESTCHECK( + CheckDelta(data, args->expected[i], count, type, args->deltaHost)); maxDelta = std::max(*(args->deltaHost), maxDelta); #ifdef DEBUG_PRINT if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); - - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Expected: "); - for(int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - printf("\n"); - - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Actual: "); - for (int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, + cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); } #endif } - double nranks = args->nProcs*args->nThreads*args->nGpus; - if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + double nranks = args->nProcs * args->nThreads * args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type) * (nranks - 1)) + args->errors[0]++; *delta = 
maxDelta; return testSuccess; } -testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { cudaError_t cudaErr; int remaining = ngpus; - int* done = (int*)malloc(sizeof(int)*ngpus); - memset(done, 0, sizeof(int)*ngpus); + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); while (remaining) { - int idle = 1; - for (int i=0; i= NCCL_VERSION(2,4,0) - if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { - ncclResult_t ncclAsyncErr; - NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); - if (ncclAsyncErr != ncclSuccess) { - // An asynchronous error happened. Stop the operation and destroy - // the communicator - for (int i=0; i= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } #endif - } + } - // We might want to let other threads (including NCCL threads) use the CPU. - if (idle) pthread_yield(); + // We might want to let other threads (including NCCL threads) use the CPU. + if (idle) + pthread_yield(); } free(done); return testSuccess; } -testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter) { +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { size_t count = args->nbytes / wordSize(type); - // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange size_t totalnbytes = max(args->sendBytes, args->expectedBytes); size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; size_t shift = totalnbytes * (iter % steps); @@ -522,57 +599,89 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t NCCLCHECK(ncclGroupStart()); } for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); #ifndef NCCL_MAJOR int cudaDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); CUDACHECK(cudaSetDevice(cudaDev)); #endif - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - char* recvBuff = ((char*)args->recvbuffs[i]) + shift; - char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + char *sendBuff = ((char *)args->sendbuffs[i]) + shift; ncclRedOp_t op; - if(opIndex < ncclNumOps) { + if (opIndex < ncclNumOps) { op = opIndex; } - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) else { union { - int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; - half f16; float f32; double f64; - #if defined(__CUDA_BF16_TYPES_EXIST__) + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) __nv_bfloat16 bf16; - #endif +#endif }; int scalar = preMulScalar(rank); - switch(type) { - case ncclInt8: i8 = int8_t(scalar); break; - case ncclUint8: u8 = uint8_t(scalar); break; - case ncclInt32: i32 = int32_t(scalar); break; - case ncclUint32: u32 = uint32_t(scalar); break; - case ncclInt64: i64 = int32_t(scalar); break; - case ncclUint64: u64 = uint32_t(scalar); break; - case ncclFloat16: f16 = __float2half(float(scalar)); break; - case ncclFloat32: f32 = float(scalar); break; - case ncclFloat64: f64 = double(scalar); break; - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; - #endif + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif } - NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); } - #endif +#endif TESTCHECK(args->collTest->runColl( - (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), - (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - if(opIndex >= ncclNumOps) { - NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank + : sendBuff), + (void *)(in_place ? 
recvBuff + args->recvInplaceOffset * rank + : recvBuff), + count, type, op, root, comm, args->streams[i])); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); } - #endif +#endif } if (args->nGpus > 1) { printf("\nstartColl, args->nGpus > 1 run ncclGroupEnd\n"); @@ -583,18 +692,21 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Complete op before returning TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); } - if (blocking_coll) Barrier(args); + if (blocking_coll) + Barrier(args); return testSuccess; } -testResult_t completeColl(struct threadArgs* args) { - if (blocking_coll) return testSuccess; +testResult_t completeColl(struct threadArgs *args) { + if (blocking_coll) + return testSuccess; TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); return testSuccess; } -testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); if (datacheck) { // Initialize sendbuffs, recvbuffs and expected @@ -606,24 +718,36 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { - if (agg_iters>1) NCCLCHECK(ncclGroupStart()); - for (int aiter = 0; aiter < agg_iters; aiter++) { - TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + if (multi_iters > 1) { + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter)); + } + } else { + if (agg_iters > 1) + NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, + iter * agg_iters + aiter, 0)); + } + if (agg_iters > 1) + NCCLCHECK(ncclGroupEnd()); } - if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } TESTCHECK(completeColl(args)); auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); - deltaSec = deltaSec/(iters*agg_iters); - if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + double deltaSec = + std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec / (iters * agg_iters *multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; Allreduce(args, &deltaSec, average); - double algBw, busBw; - args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, + args->nProcs * args->nThreads * args->nGpus); Barrier(args); @@ -631,8 +755,22 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t static __thread int rep = 0; rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + + //test validation in single itertion, should ideally be included into the multi-iteration run + TESTCHECK(startColl(args, type, op, root, in_place, 0, 0)); + + TESTCHECK(completeColl(args)); - double timeUsec = deltaSec*1.0E6; + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads 
and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec * 1.0E6; char timeStr[100]; if (timeUsec >= 10000.0) { sprintf(timeStr, "%7.0f", timeUsec); @@ -642,9 +780,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t sprintf(timeStr, "%7.2f", timeUsec); } if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); } args->bw[0] += busBw; @@ -652,12 +790,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t return testSuccess; } -void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { - int nranks = args->nProcs*args->nGpus*args->nThreads; - size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { + int nranks = args->nProcs * args->nGpus * args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, + recvInplaceOffset; count = size / wordSize(type); - args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, + &sendInplaceOffset, &recvInplaceOffset, + (size_t)count, (size_t)nranks); args->nbytes = paramCount * wordSize(type); args->sendBytes = sendCount * wordSize(type); @@ -666,237 +807,282 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { args->recvInplaceOffset = recvInplaceOffset * wordSize(type); } -testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { +testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, + const char *typeName, ncclRedOp_t op, const char *opName, + int root) { + // Warm-up for large size + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter)); + } + } + TESTCHECK(completeColl(args)); + + // Warm-up for small size + setupArgs(args->minbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter)); + } + } + TESTCHECK(completeColl(args)); // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - PRINT("\n"); + for (size_t size = args->minbytes; size <= args->maxbytes; + size = ((args->stepfactor > 1) ? 
size * args->stepfactor + : size + args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); } return testSuccess; } -testResult_t threadRunTests(struct threadArgs* args) { +testResult_t threadRunTests(struct threadArgs *args) { // OFTEST_LOG1(TEST, "Enter threadRunTests"); // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus; CUDACHECK(cudaSetDevice(gpuid)); - TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, + test_typenames[nccltype], + (ncclRedOp_t)ncclop, test_opnames[ncclop])); return testSuccess; } -testResult_t threadInit(struct threadArgs* args) { +testResult_t threadInit(struct threadArgs *args) { // OFTEST_LOG1(TEST, "Enter threadInit"); char hostname[1024]; getHostName(hostname, 1024); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs * args->nThreads * args->nGpus; - //set main thread again + // set main thread again is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; NCCLCHECK(ncclGroupStart()); - for (int i=0; inGpus; i++) { - int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; CUDACHECK(cudaSetDevice(gpuid)); // OFTEST_LOG1(TEST, "CommInitRank here"); - NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); TESTCHECK(threadRunTests(args)); - for (int i=0; inGpus; i++) { + for (int i = 0; i < args->nGpus; i++) { NCCLCHECK(ncclCommDestroy(args->comms[i])); } return testSuccess; } -void* threadLauncher(void* thread_) { - struct testThread* thread = (struct testThread*)thread_; +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; thread->ret = thread->func(&thread->args); return NULL; } -testResult_t threadLaunch(struct testThread* thread) { +testResult_t threadLaunch(struct testThread *thread) { pthread_create(&thread->thread, NULL, threadLauncher, thread); return testSuccess; } -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { - CUDACHECK(cudaMalloc(sendbuff, nbytes)); - CUDACHECK(cudaMalloc(recvbuff, nbytes)); - if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); - return testSuccess; +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, 
nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; } testResult_t run(); // Main function -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { // Make sure everyline is flushed so that we see the progress of the test setlinebuf(stdout); - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - ncclGetVersion(&test_ncclVersion); - #else - test_ncclVersion = NCCL_VERSION_CODE; - #endif - //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) - test_opnum = 4; - test_typenum = 9; - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { - test_opnum++; // ncclAvg - #if defined(__CUDA_BF16_TYPES_EXIST__) - test_typenum++; // bfloat16 - #endif - } - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { - test_opnum++; // PreMulSum - } - #endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif // Parse args double parsed; int longindex; static struct option longopts[] = { - {"nthreads", required_argument, 0, 't'}, - {"ngpus", required_argument, 0, 'g'}, - {"minbytes", required_argument, 0, 'b'}, - {"maxbytes", required_argument, 0, 'e'}, - {"stepbytes", required_argument, 0, 'i'}, - {"stepfactor", required_argument, 0, 'f'}, - {"iters", required_argument, 0, 'n'}, - {"agg_iters", required_argument, 0, 'm'}, - {"warmup_iters", required_argument, 0, 'w'}, - {"parallel_init", required_argument, 0, 'p'}, - {"check", required_argument, 0, 'c'}, - {"op", required_argument, 0, 'o'}, - {"datatype", required_argument, 0, 'd'}, - {"root", required_argument, 0, 'r'}, - {"blocking", required_argument, 0, 'z'}, - {"cudagraph", required_argument, 0, 'G'}, - {"average", required_argument, 0, 'a'}, - {"help", no_argument, 0, 'h'}, - {} - }; - - while(1) { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); if (c == -1) break; - 
switch(c) { - case 't': - nThreads = strtol(optarg, NULL, 0); - break; - case 'g': - nGpus = strtol(optarg, NULL, 0); - break; - case 'b': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'minbytes'\n"); - return -1; - } - minBytes = (size_t)parsed; - break; - case 'e': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'maxbytes'\n"); - return -1; - } - maxBytes = (size_t)parsed; - break; - case 'i': - stepBytes = strtol(optarg, NULL, 0); - break; - case 'f': - stepFactor = strtol(optarg, NULL, 0); - break; - case 'n': - iters = (int)strtol(optarg, NULL, 0); - break; - case 'm': + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'M': + multi_iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': #if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) - agg_iters = (int)strtol(optarg, NULL, 0); + agg_iters = (int)strtol(optarg, NULL, 0); #else - fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); #endif - break; - case 'w': - warmup_iters = (int)strtol(optarg, NULL, 0); - break; - case 'c': - datacheck = (int)strtol(optarg, NULL, 0); - break; - case 'p': - parallel_init = (int)strtol(optarg, NULL, 0); - break; - case 'o': - ncclop = ncclstringtoop(optarg); - break; - case 'd': - nccltype = ncclstringtotype(optarg); - break; - case 'r': - ncclroot = strtol(optarg, NULL, 0); - break; - case 'z': - blocking_coll = strtol(optarg, NULL, 0); - break; - case 'G': -#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 - cudaGraphLaunches = strtol(optarg, NULL, 0); + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \ + CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); #else - printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA " + "11.3. 
Ignoring\n"); #endif - break; - case 'a': - average = (int)strtol(optarg, NULL, 0); - break; - case 'h': - default: - if (c != 'h') printf("invalid option '%c'\n", c); - printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" - "[-g,--ngpus ] \n\t" - "[-b,--minbytes ] \n\t" - "[-e,--maxbytes ] \n\t" - "[-i,--stepbytes ] \n\t" - "[-f,--stepfactor ] \n\t" - "[-n,--iters ] \n\t" - "[-m,--agg_iters ] \n\t" - "[-w,--warmup_iters ] \n\t" - "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - "[-o,--op ] \n\t" -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - "[-o,--op ] \n\t" + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-M,--multi_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op ] \n\t" #else - "[-o,--op ] \n\t" + "[-o,--op ] \n\t" #endif - "[-d,--datatype ] \n\t" - "[-r,--root ] \n\t" - "[-z,--blocking <0/1>] \n\t" - "[-G,--cudagraph ] \n\t" - "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" - "[-h,--help]\n", - basename(argv[0])); - return 0; + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; } } if (minBytes > maxBytes) { - fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", - (unsigned long long)minBytes, - (unsigned long long)maxBytes); + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); return -1; } #ifdef MPI_SUPPORT @@ -917,18 +1103,28 @@ testResult_t run() { MPI_Comm_rank(MPI_COMM_WORLD, &proc); uint64_t hostHashs[nProcs]; hostHashs[proc] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); - if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); - if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? 
"factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); PRINT("#\n"); PRINT("# Using devices\n"); @@ -936,23 +1132,26 @@ testResult_t run() { char line[MAX_LINE]; int len = 0; size_t maxMem = ~0; - for (int i=0; i memMaxBytes) { maxBytes = memMaxBytes; - if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + if (proc == 0) + printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + maxBytes); } ncclUniqueId ncclId; @@ -975,45 +1177,73 @@ testResult_t run() { MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif - cudaStream_t streams[nGpus*nThreads]; - void* sendbuffs[nGpus*nThreads]; - void* recvbuffs[nGpus*nThreads]; - void* expected[nGpus*nThreads]; + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads]; + void *recvbuffs[nGpus * nThreads]; + void *expected[nGpus * nThreads]; size_t sendBytes, recvBytes; - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + (size_t)nProcs * nGpus * nThreads); - for (int i=0; i=0; t--) { - threads[t].args.minbytes=minBytes; - threads[t].args.maxbytes=maxBytes; - threads[t].args.stepbytes=stepBytes; - threads[t].args.stepfactor=stepFactor; + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; threads[t].args.localRank = localRank; - threads[t].args.nProcs=nProcs; - threads[t].args.proc=proc; - threads[t].args.nThreads=nThreads; - threads[t].args.thread=t; - threads[t].args.nGpus=nGpus; - threads[t].args.sendbuffs = sendbuffs+t*nGpus; - threads[t].args.recvbuffs = recvbuffs+t*nGpus; - threads[t].args.expected = expected+t*nGpus; + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + threads[t].args.sendbuffs = sendbuffs + t * nGpus; + threads[t].args.recvbuffs = recvbuffs + t * nGpus; + threads[t].args.expected = expected + t * nGpus; threads[t].args.ncclId = ncclId; - threads[t].args.comms=comms+t*nGpus; - threads[t].args.streams=streams+t*nGpus; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } - threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; threads[t].args.barrier_idx = 0; - threads[t].args.reduce = (volatile double*)reduce; - threads[t].args.sync = (volatile int*)sync; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; threads[t].args.sync_idx = 0; - threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); - threads[t].args.errors=errors+t; - threads[t].args.bw=bw+t; - threads[t].args.bw_count=bw_count+t; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; 
threads[t].args.reportErrors = 1; threads[t].func = parallel_init ? threadInit : threadRunTests; if (t) - TESTCHECK(threadLaunch(threads+t)); + TESTCHECK(threadLaunch(threads + t)); else TESTCHECK(threads[t].func(&threads[t].args)); } // Wait for other threads and accumulate stats and errors - for (int t=nThreads-1; t>=0; t--) { - if (t) pthread_join(threads[t].thread, NULL); + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); TESTCHECK(threads[t].ret); if (t) { errors[0] += errors[t]; @@ -1082,25 +1317,31 @@ testResult_t run() { #endif if (!parallel_init) { - for(int i=0; icollTest->initData(args, type, op, root, rep, in_place)); -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // Begin cuda graph capture for data check -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); -// } -// } -// #endif - //test validation in single itertion, should ideally be included into the multi-iteration run - // TESTCHECK(startColl(args, type, op, root, in_place, 0)); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // End cuda graph capture -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); -// } -// // Instantiate cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); -// } -// // Launch cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); -// } -// } -// #endif - - // TESTCHECK(completeColl(args)); + TESTCHECK(startColl(args, type, op, root, in_place, 0)); -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// //destroy cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); -// CUDACHECK(cudaGraphDestroy(graphs[i])); -// } -// } -// #endif + TESTCHECK(completeColl(args)); TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); From 76b1cd7ed62694b63a879637f81f2d20d8f42ce6 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 19 Jul 2022 18:16:08 +0000 Subject: [PATCH 013/109] remove log --- src_simple/common_simple.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index bb64ebc..706db22 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -600,7 +600,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, } for (int i = 0; i < args->nGpus; i++) { ncclComm_t comm = args->comms[miter * nGpus + i]; - OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); #ifndef NCCL_MAJOR int cudaDev; NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); From a466914a2f9e034a3ff25e33ace984117d0feb4a Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 28 Jul 2022 16:50:23 +0000 Subject: [PATCH 014/109] use prepare and done in nccl-tests --- src_simple/common_simple.cu | 108 ++++++++++++++++++++++++++++++++- src_simple/common_simple.h | 9 +-- src_simple/ofccl_all_reduce.cu | 16 +++-- 3 files changed, 122 insertions(+), 11 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 706db22..e34c2b1 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -584,6 +584,94 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, return 
testSuccess; } +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + testResult_t startColl(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { size_t count = args->nbytes / wordSize(type); @@ -595,7 +683,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, size_t shift = totalnbytes * (iter % steps); if (args->nGpus > 1) { - printf("\nstartColl, args->nGpus > 1 run ncclGroupStart\n"); + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); NCCLCHECK(ncclGroupStart()); } for (int i = 0; i < args->nGpus; i++) { @@ -684,7 +772,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, #endif } if (args->nGpus > 1) { - printf("\nstartColl, args->nGpus > 1 run ncclGroupEnd\n"); + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); NCCLCHECK(ncclGroupEnd()); } @@ -809,7 +897,21 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, const char *typeName, ncclRedOp_t op, const char *opName, - int root) { + int root, bool is_ofccl) { + if (is_ofccl) { + // prepare for all size. op, type traversed in the caller. + for (size_t size = args->minbytes; size <= args->maxbytes; + size = ((args->stepfactor > 1) ? 
size * args->stepfactor + : size + args->stepbytes)) { + setupArgs(size, type, args); + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter)); + } + } + + ofcclPrepareDone(); + } + // Warm-up for large size setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index caaafef..b5c85a1 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -16,9 +16,9 @@ #include #include "nccl1_compat.h" -#define OFTEST_LOG(PRE, FMT, args...) printf("\nTEST [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__, args) -#define OFTEST_LOG1(PRE, FMT) printf("\nTEST [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__) -#define OFTEST_LOG0(PRE) printf("\nTEST [%s:%d] <%s> " #PRE, __FILE__, __LINE__, __func__) +#define OFTEST_LOG(PRE, FMT, args...) printf("\n(testlog) [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("\n(testlog) [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("\n(testlog) [%s:%d] <%s> " #PRE, __FILE__, __LINE__, __func__) #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ @@ -75,6 +75,7 @@ struct testColl { void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; @@ -144,7 +145,7 @@ struct testThread { // Provided by common.cu extern void Barrier(struct threadArgs* args); -extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 62f8b69..cda3f34 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -57,8 +57,15 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ofcclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> ofccl_nccl_test invoke ofcclAllReduce\n", getpid()); + // NCCLCHECK(ofcclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + // OFTEST_LOG1(TEST, "UNIMPLEMENTED ofcclAllReduce"); + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, 
op, comm, collId)); + // OFTEST_LOG(TEST, "invoke ofcclPrepareAllReduce with count=%lu, collId=%d", count, collId); return testSuccess; } @@ -67,7 +74,8 @@ struct testColl allReduceTest = { AllReduceGetCollByteCount, AllReduceInitData, AllReduceGetBw, - AllReduceRunColl + AllReduceRunColl, + AllReducePrepare }; void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { @@ -104,7 +112,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t for (int i=0; i Date: Tue, 9 Aug 2022 07:00:56 +0000 Subject: [PATCH 015/109] check no reused ncclComm in ofcclCommList --- src_simple/common_simple.cu | 1 + src_simple/ofccl_all_reduce.cu | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index e34c2b1..44ffe21 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -1297,6 +1297,7 @@ testResult_t run() { } // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when use multi size. ncclComm_t *comms = (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); ncclComm_t *adjusted_comms = diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index cda3f34..4d9af93 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -8,6 +8,7 @@ #include "common_simple.h" #include #include +#include void print_header() { PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); @@ -65,7 +66,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, nccl testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId) { NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId)); - // OFTEST_LOG(TEST, "invoke ofcclPrepareAllReduce with count=%lu, collId=%d", count, collId); + OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } From 818c8e3fa0651757f490668f64f63056761cd54b Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 9 Aug 2022 08:17:19 +0000 Subject: [PATCH 016/109] invoke ofcclDestroy --- src_simple/common_simple.cu | 10 +++++++++- src_simple/ofccl_all_reduce.cu | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 44ffe21..2320dc9 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -900,6 +900,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, int root, bool is_ofccl) { if (is_ofccl) { // prepare for all size. op, type traversed in the caller. + // TODO: if we support multi size, each size should use a separate ncclComm for (size_t size = args->minbytes; size <= args->maxbytes; size = ((args->stepfactor > 1) ? 
size * args->stepfactor : size + args->stepbytes)) { @@ -912,6 +913,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, ofcclPrepareDone(); } + // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 // Warm-up for large size setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { @@ -943,6 +945,12 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } + + if (is_ofccl) { + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); + ofcclDestroy(); + } + return testSuccess; } @@ -1307,7 +1315,7 @@ testResult_t run() { int gpuArray[nGpus * nThreads]; for (int i = 0; i < nGpus * nThreads; i++) gpuArray[i] = i; - OFTEST_LOG1(TEST, "CommInitAll here"); + // OFTEST_LOG1(TEST, "CommInitAll here"); // use seprate comm // TODO: we do not support MPI now. for (int miter = 0; miter < multi_iters; miter++) { diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 4d9af93..714365d 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -66,7 +66,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, nccl testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId) { NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId)); - OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } From 67e70b93f9cf73ec227f8188a42b1f0060970ecc Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 23 Aug 2022 03:15:33 +0000 Subject: [PATCH 017/109] use ofcclRunAllReduce --- .gitignore | 4 +++- src_simple/Makefile | 3 ++- src_simple/common_simple.cu | 3 +-- src_simple/common_simple.h | 3 +-- src_simple/ofccl_all_reduce.cu | 4 ++-- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index c908b05..b0853be 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,6 @@ # See LICENCE.txt for license information /build -.clangd \ No newline at end of file +.clangd + +.vscode \ No newline at end of file diff --git a/src_simple/Makefile b/src_simple/Makefile index 3247401..de282de 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -75,7 +75,8 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce_group all_reduce_simple ofccl_all_reduce +# BIN_FILES_LIST := all_reduce_group all_reduce_simple ofccl_all_reduce +BIN_FILES_LIST := ofccl_all_reduce BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 2320dc9..b03fbd9 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -762,8 +762,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff), (void *)(in_place ? 
recvBuff + args->recvInplaceOffset * rank - : recvBuff), - count, type, op, root, comm, args->streams[i])); + : recvBuff), miter)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index b5c85a1..1fb299d 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -73,8 +73,7 @@ struct testColl { testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); - testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId); testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId); }; extern struct testColl allReduceTest; diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 714365d..63744ae 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -56,9 +56,9 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId) { - // NCCLCHECK(ofcclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId)); // OFTEST_LOG1(TEST, "UNIMPLEMENTED ofcclAllReduce"); return testSuccess; } From 97f58bcbb70611c3b8caba914b3c5b295087802c Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 26 Aug 2022 06:56:24 +0000 Subject: [PATCH 018/109] use callback --- src_simple/common_simple.h | 6 +++--- src_simple/ofccl_all_reduce.cu | 23 +++++++++++++++++++++-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 1fb299d..9b82e5a 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -16,9 +16,9 @@ #include #include "nccl1_compat.h" -#define OFTEST_LOG(PRE, FMT, args...) printf("\n(testlog) [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__, args) -#define OFTEST_LOG1(PRE, FMT) printf("\n(testlog) [%s:%d] <%s> " #PRE " " FMT, __FILE__, __LINE__, __func__) -#define OFTEST_LOG0(PRE) printf("\n(testlog) [%s:%d] <%s> " #PRE, __FILE__, __LINE__, __func__) +#define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 63744ae..a764338 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -9,6 +9,7 @@ #include #include #include +#include void print_header() { PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); @@ -57,9 +58,27 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl } testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId) { + int gotCqe = 0; + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + auto callback = [&](int collIdFromCqe){ + if (collId != collIdFromCqe) { + // TODO: more robust error handle. + OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + return -1; + } + gotCqe = 1; + OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + return 0; + }; + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, callback)); - NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId)); - // OFTEST_LOG1(TEST, "UNIMPLEMENTED ofcclAllReduce"); + // TODO: 这会损害带宽测量的结果,之后在common_simple.cu里搞个数组,统一等待。 + while(gotCqe == 0) { + sched_yield(); + } + return testSuccess; } From 8a3d5f88511063db09d10c3d48baf229ceb75c5c Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 26 Aug 2022 09:47:13 +0000 Subject: [PATCH 019/109] use func-ptr for callback, instead of std::function and lambda --- src_simple/common_simple.h | 5 +++++ src_simple/ofccl_all_reduce.cu | 34 +++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 9b82e5a..82a581c 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -64,6 +64,11 @@ typedef enum { } \ } while(0) +typedef struct { + int collId; + int gotCqe; +} CallBackArgs; + struct testColl { const char name[20]; void (*getCollByteCount)( diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index a764338..ed80fa3 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -57,25 +57,33 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } +int myCallback(int collIdFromCqe, void *args) { + // TODO: 不打log把这里删了 + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + int collId = ((CallBackArgs *)args)->collId; + if (collId != collIdFromCqe) { + // TODO: more robust error handle. + OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + return -1; + } + ((CallBackArgs *)args)->gotCqe = 1; + OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + return 0; +} + testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId) { - int gotCqe = 0; int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - auto callback = [&](int collIdFromCqe){ - if (collId != collIdFromCqe) { - // TODO: more robust error handle. 
- OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); - return -1; - } - gotCqe = 1; - OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); - return 0; - }; - NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, callback)); + CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args)); // TODO: 这会损害带宽测量的结果,之后在common_simple.cu里搞个数组,统一等待。 - while(gotCqe == 0) { + while(args->gotCqe == 0) { sched_yield(); } From a3a1aea2cd278e80784470b61aa8a679c7bf537e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 28 Aug 2022 10:22:01 +0000 Subject: [PATCH 020/109] stuck --- src_simple/common_simple.cu | 21 ++++++++++++++++++--- src_simple/common_simple.h | 4 +++- src_simple/ofccl_all_reduce.cu | 11 +++-------- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index b03fbd9..d123f54 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -100,6 +100,9 @@ static int average = 1; #define NUM_BLOCKS 32 +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + static double parsesize(const char *value) { long long int units; double size; @@ -757,12 +760,12 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, &op, &u64, type, ncclScalarHostImmediate, comm)); } #endif - + // miter就是collId。 TESTCHECK(args->collTest->runColl( (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff), (void *)(in_place ? recvBuff + args->recvInplaceOffset * rank - : recvBuff), miter)); + : recvBuff), miter, cbArgList + miter)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { @@ -787,8 +790,20 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, testResult_t completeColl(struct threadArgs *args) { if (blocking_coll) return testSuccess; + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + } + } + } + } - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + // TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); return testSuccess; } diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 82a581c..c8e94e6 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -69,6 +69,8 @@ typedef struct { int gotCqe; } CallBackArgs; +#define MAX_COLL_NUM 10000 + struct testColl { const char name[20]; void (*getCollByteCount)( @@ -78,7 +80,7 @@ struct testColl { testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); - testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args); testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId); }; extern struct testColl allReduceTest; diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index ed80fa3..0f9fef2 100644 --- a/src_simple/ofccl_all_reduce.cu +++ 
From a3a1aea2cd278e80784470b61aa8a679c7bf537e Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Sun, 28 Aug 2022 10:22:01 +0000
Subject: [PATCH 020/109] stuck

---
 src_simple/common_simple.cu    | 21 ++++++++++++++++++---
 src_simple/common_simple.h     |  4 +++-
 src_simple/ofccl_all_reduce.cu | 11 +++--------
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index b03fbd9..d123f54 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -100,6 +100,9 @@ static int average = 1;
 
 #define NUM_BLOCKS 32
 
+static thread_local CallBackArgs cbArgList[MAX_COLL_NUM];
+static thread_local int seenCqe[MAX_COLL_NUM];
+
 static double parsesize(const char *value) {
   long long int units;
   double size;
@@ -757,12 +760,12 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type,
           &op, &u64, type, ncclScalarHostImmediate, comm));
     }
 #endif
-
+  // miter is the collId.
   TESTCHECK(args->collTest->runColl(
       (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank
                         : sendBuff),
       (void *)(in_place ? recvBuff + args->recvInplaceOffset * rank
-                        : recvBuff), miter));
+                        : recvBuff), miter, cbArgList + miter));
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
     if (opIndex >= ncclNumOps) {
@@ -787,8 +790,20 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type,
 testResult_t completeColl(struct threadArgs *args) {
   if (blocking_coll)
     return testSuccess;
+
+  int gotCqeCnt = 0;
+  while (gotCqeCnt < multi_iters) {
+    for (int i = 0; i < multi_iters; i++) {
+      if (cbArgList[i].gotCqe == 1) {
+        if (seenCqe[i] == 0) {
+          gotCqeCnt++;
+          seenCqe[i] = 1;
+        }
+      }
+    }
+  }
 
-  TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
+  // TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
   return testSuccess;
 }
 
diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h
index 82a581c..c8e94e6 100644
--- a/src_simple/common_simple.h
+++ b/src_simple/common_simple.h
@@ -69,6 +69,8 @@ typedef struct {
   int gotCqe;
 } CallBackArgs;
 
+#define MAX_COLL_NUM 10000
+
 struct testColl {
   const char name[20];
   void (*getCollByteCount)(
@@ -78,7 +80,7 @@ struct testColl {
   testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
       ncclRedOp_t op, int root, int rep, int in_place);
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
-  testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId);
+  testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args);
   testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId);
 };
 extern struct testColl allReduceTest;
diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu
index ed80fa3..0f9fef2 100644
--- a/src_simple/ofccl_all_reduce.cu
+++ b/src_simple/ofccl_all_reduce.cu
@@ -58,7 +58,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
 }
 
 int myCallback(int collIdFromCqe, void *args) {
-  // TODO: remove this once we stop logging
+  // TODO: remove this once we stop logging, otherwise it hurts performance.
   int cudaDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
   int collId = ((CallBackArgs *)args)->collId;
@@ -72,20 +72,15 @@ int myCallback(int collIdFromCqe, void *args) {
   return 0;
 }
 
-testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId) {
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args) {
   int cudaDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
 
-  CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs));
+  // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs));
   args->collId = collId;
   args->gotCqe = 0;
 
   NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args));
-
-  // TODO: this hurts the bandwidth measurement; later, keep an array in common_simple.cu and wait on all of them in one place.
-  while(args->gotCqe == 0) {
-    sched_yield();
-  }
 
   return testSuccess;
 }
 
From 7ff3ea5deeb045105c226767557385b27c0812e8 Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Sun, 28 Aug 2022 10:45:21 +0000
Subject: [PATCH 021/109] completeColl in warmup result in stuck

---
 src_simple/common_simple.cu    | 37 +++++++++++++++++-----------------
 src_simple/ofccl_all_reduce.cu |  1 +
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index d123f54..931fd6e 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -928,25 +928,26 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type,
   }
 
   // TODO: if we support multi size, we could warm up every size, or keep the current way but make sure the right comm is picked.
+  // TODO: also, if we keep the warmup we have to prepare matching callbackArgs, which is cumbersome; for the comparison experiments, consider running both nccl and ofccl without warmup.
   // Warm-up for large size
-  setupArgs(args->maxbytes, type, args);
-  for (int iter = 0; iter < warmup_iters; iter++) {
-    for (int miter = 0; miter < multi_iters; miter++) {
-      TESTCHECK(startColl(args, type, op, root, 0,
-                          iter * multi_iters + miter, miter));
-    }
-  }
-  TESTCHECK(completeColl(args));
-
-  // Warm-up for small size
-  setupArgs(args->minbytes, type, args);
-  for (int iter = 0; iter < warmup_iters; iter++) {
-    for (int miter = 0; miter < multi_iters; miter++) {
-      TESTCHECK(startColl(args, type, op, root, 0,
-                          iter * multi_iters + miter, miter));
-    }
-  }
-  TESTCHECK(completeColl(args));
+  // setupArgs(args->maxbytes, type, args);
+  // for (int iter = 0; iter < warmup_iters; iter++) {
+  //   for (int miter = 0; miter < multi_iters; miter++) {
+  //     TESTCHECK(startColl(args, type, op, root, 0,
+  //                         iter * multi_iters + miter, miter));
+  //   }
+  // }
+  // TESTCHECK(completeColl(args));
+
+  // // Warm-up for small size
+  // setupArgs(args->minbytes, type, args);
+  // for (int iter = 0; iter < warmup_iters; iter++) {
+  //   for (int miter = 0; miter < multi_iters; miter++) {
+  //     TESTCHECK(startColl(args, type, op, root, 0,
+  //                         iter * multi_iters + miter, miter));
+  //   }
+  // }
+  // TESTCHECK(completeColl(args));
 
   // Benchmark
   for (size_t size = args->minbytes; size <= args->maxbytes;
diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu
index 0f9fef2..0b6aacf 100644
--- a/src_simple/ofccl_all_reduce.cu
+++ b/src_simple/ofccl_all_reduce.cu
@@ -81,6 +81,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa
   args->gotCqe = 0;
 
   NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args));
+  OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args);
 
   return testSuccess;
 }
 
From ee76beb6f3b204b97bf4e5d2b82530ef449bf111 Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Sun, 28 Aug 2022 12:38:09 +0000
Subject: [PATCH 022/109] +lock

---
 src_simple/common_simple.cu    | 14 ++++++++++++--
 src_simple/common_simple.h     |  1 +
 src_simple/ofccl_all_reduce.cu |  3 +++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index 931fd6e..5f3aadc 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -790,25 +790,35 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type,
 testResult_t completeColl(struct threadArgs *args) {
   if (blocking_coll)
     return testSuccess;
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
 
   int gotCqeCnt = 0;
   while (gotCqeCnt < multi_iters) {
     for (int i = 0; i < multi_iters; i++) {
+      pthread_mutex_lock(&cbArgList[i].mutex);
       if (cbArgList[i].gotCqe == 1) {
         if (seenCqe[i] == 0) {
           gotCqeCnt++;
           seenCqe[i] = 1;
         }
       }
+      pthread_mutex_unlock(&cbArgList[i].mutex);
     }
+    // OFTEST_LOG(TEST, "<%lu> rank=%d, completeColl gotCqeCnt = %d", pthread_self(), cudaDev, gotCqeCnt);
   }
 
   // TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms));
   return testSuccess;
 }
 
-testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type,
-                       ncclRedOp_t op, int root, int in_place) {
+testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, multi_iters = %d", pthread_self(), cudaDev, multi_iters);
+
   size_t count = args->nbytes / wordSize(type);
   if (datacheck) {
     // Initialize sendbuffs, recvbuffs and expected
diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h
index c8e94e6..bf2d0fd 100644
--- a/src_simple/common_simple.h
+++ b/src_simple/common_simple.h
@@ -67,6 +67,7 @@ typedef enum {
 typedef struct {
   int collId;
   int gotCqe;
+  pthread_mutex_t mutex;
 } CallBackArgs;
 
 #define MAX_COLL_NUM 10000
diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu
index 0b6aacf..b022b98 100644
--- a/src_simple/ofccl_all_reduce.cu
+++ b/src_simple/ofccl_all_reduce.cu
@@ -67,7 +67,9 @@ int myCallback(int collIdFromCqe, void *args) {
     OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId);
     return -1;
   }
+  pthread_mutex_lock(&(((CallBackArgs *)args)->mutex));
   ((CallBackArgs *)args)->gotCqe = 1;
+  pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex));
   OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId);
   return 0;
 }
@@ -79,6 +81,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa
   // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs));
   args->collId = collId;
   args->gotCqe = 0;
+  pthread_mutex_init(&args->mutex, NULL);
 
   NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args));
   OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args);
From 3f0a8fea86fd275a0e1748b71207848ede52d95b Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Mon, 29 Aug 2022 09:07:35 +0000
Subject: [PATCH 023/109] tidy log

---
 src_simple/common_simple.cu    | 8 ++++----
 src_simple/ofccl_all_reduce.cu | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index 5f3aadc..137288c 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -814,10 +814,6 @@ testResult_t completeColl(struct threadArgs *args) {
 }
 
 testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
-
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, multi_iters = %d", pthread_self(), cudaDev, multi_iters);
 
   size_t count = args->nbytes / wordSize(type);
   if (datacheck) {
@@ -1263,6 +1259,10 @@ testResult_t run() {
   PRINT("#\n");
 
   PRINT("# Using devices\n");
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, multi_iters = %d", pthread_self(), cudaDev, multi_iters);
 #define MAX_LINE 2048
   char line[MAX_LINE];
   int len = 0;
diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu
index b022b98..2b336d4 100644
--- a/src_simple/ofccl_all_reduce.cu
+++ b/src_simple/ofccl_all_reduce.cu
@@ -70,7 +70,7 @@ int myCallback(int collIdFromCqe, void *args) {
   pthread_mutex_lock(&(((CallBackArgs *)args)->mutex));
   ((CallBackArgs *)args)->gotCqe = 1;
   pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex));
-  OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId);
+  // OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId);
   return 0;
 }
 
@@ -84,7 +84,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa
   pthread_mutex_init(&args->mutex, NULL);
 
   NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args));
-  OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args);
+  // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args);
 
   return testSuccess;
 }
 
From 0bd6d6aa75e4e963085265f80bf6f3843d2ae90a Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Mon, 5 Sep 2022 01:55:48 +0000
Subject: [PATCH 024/109] nccl-tests run exactly once

---
 src/common.cu | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/common.cu b/src/common.cu
index 05f814d..72857cd 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -596,8 +596,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   }
 
   // Sync
-  TESTCHECK(startColl(args, type, op, root, in_place, 0));
-  TESTCHECK(completeColl(args));
+  // TODO: restore this later?
+  // TESTCHECK(startColl(args, type, op, root, in_place, 0));
+  // TESTCHECK(completeColl(args));
 
   Barrier(args);
 
@@ -777,7 +778,8 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
     setupArgs(size, type, args);
     print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
     TESTCHECK(BenchTime(args, type, op, root, 0));
-    TESTCHECK(BenchTime(args, type, op, root, 1));
+    // TODO: measure whether this should be restored?
+    // TESTCHECK(BenchTime(args, type, op, root, 1));
     PRINT("\n");
   }
   return testSuccess;
 
From 9a35e7f214c5eb4e3ed89317a36c292ee0c0980c Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Thu, 8 Sep 2022 11:47:15 +0000
Subject: [PATCH 025/109] ad-hoc check

---
 src_simple/common_simple.cu        |   32 +-
 src_simple/common_simple.cu.pure   | 1216 ----------------------------
 src_simple/common_simple.cu.simple | 1186 ---------------------------
 3 files changed, 17 insertions(+), 2417 deletions(-)
 delete mode 100644 src_simple/common_simple.cu.pure
 delete mode 100644 src_simple/common_simple.cu.simple

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index 137288c..063664d 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -279,7 +279,9 @@ __device__ double testValue(const size_t offset, const int rep,
 template <>
 __device__ float testValue(const size_t offset, const int rep,
                            const int rank) {
-  return 1.0 / (1.0 + (float)testValue(offset, rep, rank));
+  // IF_CHECK: to verify correctness, comment out the first return and expose the second one.
+  // return 1.0 / (1.0 + (float)testValue(offset, rep, rank));
+  return 1.0;
 }
 template <>
 __device__ half testValue(const size_t offset, const int rep,
@@ -826,20 +828,9 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
   // Performance Benchmark
   auto start = std::chrono::high_resolution_clock::now();
   for (int iter = 0; iter < iters; iter++) {
-    if (multi_iters > 1) {
-      for (int miter = 0; miter < multi_iters; miter++) {
-        TESTCHECK(startColl(args, type, op, root, in_place,
-                            iter * multi_iters + miter, miter));
-      }
-    } else {
-      if (agg_iters > 1)
-        NCCLCHECK(ncclGroupStart());
-      for (int aiter = 0; aiter < agg_iters; aiter++) {
-        TESTCHECK(startColl(args, type, op, root, in_place,
-                            iter * agg_iters + aiter, 0));
-      }
-      if (agg_iters > 1)
-        NCCLCHECK(ncclGroupEnd());
+    for (int miter = 0; miter < multi_iters; miter++) {
+      TESTCHECK(startColl(args, type, op, root, in_place,
+                          iter * multi_iters + miter, miter));
     }
   }
 
@@ -863,6 +854,17 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
   static __thread int rep = 0;
   rep++;
 
+  // IF_CHECK: to verify correctness, expose the block below.
+  int printNum = 10;
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  float *ptr = (float *)malloc(printNum * sizeof(float));
+  cudaMemcpy(ptr, args->recvbuffs[0], printNum * sizeof(float), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < printNum; i++) {
+    OFTEST_LOG(TEST, "<%lu> rank=%d, recvbuff[%d]=%f", pthread_self(), cudaDev, i, ptr[i]);
+  }
+  free(ptr);
+
   if (datacheck) {
     // Initialize sendbuffs, recvbuffs and expected
     TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place));
diff --git a/src_simple/common_simple.cu.pure b/src_simple/common_simple.cu.pure
deleted file mode 100644
index c25c0e3..0000000
--- a/src_simple/common_simple.cu.pure
+++ /dev/null
@@ -1,1216 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
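The IF_CHECK switch in PATCH 025 relies on a simple invariant: once testValue<float> always returns 1.0, a sum AllReduce over nranks ranks must leave exactly (float)nranks in every element of recvbuff, which is what the printNum loop added to BenchTime lets you eyeball in the logs. A hypothetical helper that automates the same spot check is sketched below; checkAllOnesAllReduce is not part of the harness, and it assumes the default ncclSum op.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Counts mismatches among the first n floats of a device buffer produced by a
// sum AllReduce whose inputs were all 1.0f; every element should equal nranks.
static int checkAllOnesAllReduce(const float *d_recvbuff, int n, int nranks) {
  float *h = (float *)malloc(n * sizeof(float));
  cudaMemcpy(h, d_recvbuff, n * sizeof(float), cudaMemcpyDeviceToHost);
  int bad = 0;
  for (int i = 0; i < n; i++) {
    if (h[i] != (float)nranks) {  // small integers are exact in fp32, so == is safe here
      printf("recvbuff[%d]=%f, expected %d\n", i, h[i], nranks);
      bad++;
    }
  }
  free(h);
  return bad;
}

Called right after completeColl() with args->recvbuffs[0], it yields a pass/fail count instead of ten log lines to read.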
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "common_simple.h" -#include -#include -#include -#include -#include "cuda.h" - -int test_ncclVersion = 0; // init'd with ncclGetVersion() - -#if NCCL_MAJOR >= 2 - ncclDataType_t test_types[ncclNumTypes] = { - ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclBfloat16 - #endif - }; - const char *test_typenames[ncclNumTypes] = { - "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , "bfloat16" - #endif - }; - int test_typenum = -1; - - const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclAvg - #endif - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand - #endif - }; - int test_opnum = -1; -#else - ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; - const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; - int test_typenum = 7; - const char *test_opnames[] = {"sum", "prod", "max", "min"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; - int test_opnum = 4; -#endif - -thread_local int is_main_thread = 0; - -// Command line parameter defaults -static int nThreads = 1; -static int nGpus = 1; -static size_t minBytes = 32*1024*1024; -static size_t maxBytes = 32*1024*1024; -static size_t stepBytes = 1*1024*1024; -static size_t stepFactor = 1; -static int datacheck = 1; -static int warmup_iters = 5; -static int iters = 20; -static int agg_iters = 1; -static int ncclop = ncclSum; -static int nccltype = ncclFloat; -static int ncclroot = 0; -static int parallel_init = 0; -static int blocking_coll = 0; -static int cudaGraphLaunches = 0; -// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) -static int average = 1; - -#define NUM_BLOCKS 32 - -static double parsesize(const char *value) { - long long int units; - double size; - char size_lit; - - int count = sscanf(value, "%lf %1s", &size, &size_lit); - - switch (count) { - case 2: - switch (size_lit) { - case 'G': - case 'g': - units = 1024*1024*1024; - break; - case 'M': - case 'm': - units = 1024*1024; - break; - case 'K': - case 'k': - units = 1024; - break; - default: - return -1.0; - }; - break; - case 1: - units = 1; - break; - default: - return -1.0; - } - - return size * units; -} - -double DeltaMaxValue(ncclDataType_t type) { - switch(type) { - case ncclHalf: return 1e-2; -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: return 1e-2; -#endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint8: - //case ncclInt32: - case ncclUint32: -#endif - case ncclInt64: - case ncclUint64: return 1e-200; - } - return 1e-200; -} - -template __device__ -double absDiff(T a, T b) { - return fabs((double)(b - a)); -} - -template<> __device__ -double absDiff(half a, half b) { - float x = __half2float(a); - float y = __half2float(b); - return fabs((double)(y-x)); -} - -template __device__ -float toFloat(T a) { - return (float)a; -} -template<> 
__device__ -float toFloat(half a) { - return __half2float(a); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> __device__ -float toFloat(__nv_bfloat16 a) { - return __bfloat162float(a); -} -#endif - -template __global__ -void deltaKern(void* A_, void* B_, size_t count, double* max) { - const T* A = (const T*)A_; - const T* B = (const T*)B_; - __shared__ double temp[BSIZE]; - int tid = blockIdx.x*blockDim.x + threadIdx.x; - double locmax = 0.0; - for(size_t i=tid; i locmax ) { - locmax = delta; -#ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); -#endif - } - } - - tid = threadIdx.x; - temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { - __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; - } - __syncthreads(); - if( threadIdx.x == 0) - max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; -} - -testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { - switch (type) { -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; -#endif - case ncclHalf: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclFloat: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclDouble: - deltaKern<<>>(results, expected, count, devmax); break; - - case ncclChar: -#if NCCL_MAJOR >= 2 - case ncclUint8: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint32: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt64: - case ncclUint64: - deltaKern<<>>(results, expected, count, devmax); break; - } - CUDACHECK(cudaDeviceSynchronize()); - for (int i=1; i -__device__ T testValue(const size_t offset, const int rep, const int rank) { - uint8_t v = (rep+rank+offset) % 256; - return (T)v; -} - -// For floating point datatype, we use values between 0 and 1 otherwise the -// Product operation will produce NaNs. -template<> -__device__ double testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(double)testValue(offset, rep, rank)); -} -template<> -__device__ float testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(float)testValue(offset, rep, rank)); -} -template<> -__device__ half testValue(const size_t offset, const int rep, const int rank) { - return __float2half(testValue(offset, rep, rank)); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { - return __float2bfloat16(testValue(offset, rep, rank)); -} -#endif - -// Operations -template -__device__ T ncclOpSum(T a, T b) { return a+b; } -template -__device__ T ncclOpProd(T a, T b) { return a*b; } -template -__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } -template -__device__ T ncclOpMin(T a, T b) { return a -__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } -template<> -__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } -template<> -__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } -template<> -__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } - -template -__device__ T ncclPPOpIdent(T x, int arg) { return x; } -template -__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } -template -__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } -template<> -__device__ half ncclPPOpMul(half x, int arg) { - return __float2half(__half2float(x)*float(arg)); -} -template<> -__device__ half ncclPPOpDiv(half x, int n) { - return __float2half(__half2float(x)/n); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { - return __float2bfloat16(__bfloat162float(x)*float(arg)); -} -template<> -__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { - return __float2bfloat16(__bfloat162float(x)/n); -} -#endif - -__host__ __device__ int preMulScalar(int rank) { - return 1 + rank%2; -} - -template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); - val = PreOp(val, preMulScalar(0)); - for (int i=1; i(o+offset, rep, i); - val1 = PreOp(val1, preMulScalar(i)); - val = Op(val, val1); - } - data[o] = PostOp(val, nranks); - } -} - -#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ - KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) -#else - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) -#endif - -static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - OPS(__nv_bfloat16) -#endif -}; - -testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); - return testSuccess; -} - -template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); -} - -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - 
(void*)InitDataKernel< float>, - (void*)InitDataKernel< double>, -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - (void*)InitDataKernel<__nv_bfloat16> -#endif -}; - -template -testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { - T* ptr = (T*)dest; - InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); - return testSuccess; -} - -testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); - return testSuccess; -} - -void Barrier(struct threadArgs* args) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - MPI_Barrier(MPI_COMM_WORLD); -#endif - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); - } - args->barrier_idx=!args->barrier_idx; -} - -// Inter-thread/process barrier+allreduce -void Allreduce(struct threadArgs* args, double* value, int average) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - double val = *value; - if (args->thread > 0) { - double val2 = args->reduce[args->barrier_idx]; - if (average == 1) val += val2; - if (average == 2) val = std::min(val, val2); - if (average == 3) val = std::max(val, val2); - } - if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - if (average != 0) { - MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; - MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); - } -#endif - if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; - args->reduce[1-args->barrier_idx] = 0; - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); - } - *value = args->reduce[args->barrier_idx]; - args->barrier_idx=!args->barrier_idx; -} - -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { - size_t count = args->expectedBytes/wordSize(type); - double maxDelta = 0.0; - for (int i=0; inGpus; i++) { - int device; - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); - void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); - maxDelta = std::max(*(args->deltaHost), maxDelta); - -#ifdef DEBUG_PRINT - if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); - - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Expected: "); - for(int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - printf("\n"); - - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Actual: "); - for (int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); - } -#endif - } - double nranks = args->nProcs*args->nThreads*args->nGpus; - if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; - *delta = maxDelta; - return testSuccess; -} - -testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { - cudaError_t cudaErr; - int remaining = ngpus; - int* done = (int*)malloc(sizeof(int)*ngpus); - memset(done, 0, sizeof(int)*ngpus); - while (remaining) { - int idle = 1; - for (int i=0; i= NCCL_VERSION(2,4,0) - if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { - ncclResult_t ncclAsyncErr; - NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); - if (ncclAsyncErr != ncclSuccess) { - // An asynchronous error happened. Stop the operation and destroy - // the communicator - for (int i=0; inbytes / wordSize(type); - - // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange - size_t totalnbytes = max(args->sendBytes, args->expectedBytes); - size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; - size_t shift = totalnbytes * (iter % steps); - - if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); - for (int i = 0; i < args->nGpus; i++) { -#ifndef NCCL_MAJOR - int cudaDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); - CUDACHECK(cudaSetDevice(cudaDev)); -#endif - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - char* recvBuff = ((char*)args->recvbuffs[i]) + shift; - char* sendBuff = ((char*)args->sendbuffs[i]) + shift; - ncclRedOp_t op; - - if(opIndex < ncclNumOps) { - op = opIndex; - } - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - else { - union { - int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; - half f16; float f32; double f64; - #if defined(__CUDA_BF16_TYPES_EXIST__) - __nv_bfloat16 bf16; - #endif - }; - int scalar = preMulScalar(rank); - switch(type) { - case ncclInt8: i8 = int8_t(scalar); break; - case ncclUint8: u8 = uint8_t(scalar); break; - case ncclInt32: i32 = int32_t(scalar); break; - case ncclUint32: u32 = uint32_t(scalar); break; - case ncclInt64: i64 = int32_t(scalar); break; - case ncclUint64: u64 = uint32_t(scalar); break; - case ncclFloat16: f16 = __float2half(float(scalar)); break; - case ncclFloat32: f32 = float(scalar); break; - case ncclFloat64: f64 = double(scalar); break; - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; - #endif - } - NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); - } - #endif - - TESTCHECK(args->collTest->runColl( - (void*)(in_place ? 
recvBuff + args->sendInplaceOffset*rank : sendBuff), - (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - if(opIndex >= ncclNumOps) { - NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); - } - #endif - } - if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); - - if (blocking_coll) { - // Complete op before returning - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); - } - if (blocking_coll) Barrier(args); - return testSuccess; -} - -testResult_t completeColl(struct threadArgs* args) { - if (blocking_coll) return testSuccess; - - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); - return testSuccess; -} - -testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { - size_t count = args->nbytes / wordSize(type); - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - } - - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); - - Barrier(args); - -#if CUDART_VERSION >= 11030 - cudaGraph_t graphs[args->nGpus]; - cudaGraphExec_t graphExec[args->nGpus]; - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture - for (int i=0; inGpus; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect - CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); - } - } -#endif - - // Performance Benchmark - auto start = std::chrono::high_resolution_clock::now(); - for (int iter = 0; iter < iters; iter++) { - if (agg_iters>1) NCCLCHECK(ncclGroupStart()); - for (int aiter = 0; aiter < agg_iters; aiter++) { - TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); - } - if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); - } - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // End cuda graph capture - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); - } - // Instantiate cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); - } - // Resync CPU, restart timing, launch cuda graph - Barrier(args); - start = std::chrono::high_resolution_clock::now(); - for (int l=0; lnGpus; i++) { - CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); - } - } - } -#endif - - TESTCHECK(completeColl(args)); - - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); - deltaSec = deltaSec/(iters*agg_iters); - if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; - Allreduce(args, &deltaSec, average); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - //destroy cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphExecDestroy(graphExec[i])); - CUDACHECK(cudaGraphDestroy(graphs[i])); - } - } -#endif - - double algBw, busBw; - args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); - - Barrier(args); - - double maxDelta = 0; - static __thread int rep = 0; - rep++; - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture for data check 
- for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); - } - } -#endif - - //test validation in single itertion, should ideally be included into the multi-iteration run - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // End cuda graph capture - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); - } - // Instantiate cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); - } - // Launch cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); - } - } -#endif - - TESTCHECK(completeColl(args)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - //destroy cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphExecDestroy(graphExec[i])); - CUDACHECK(cudaGraphDestroy(graphs[i])); - } - } -#endif - - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); - - //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); - } - - double timeUsec = deltaSec*1.0E6; - char timeStr[100]; - if (timeUsec >= 10000.0) { - sprintf(timeStr, "%7.0f", timeUsec); - } else if (timeUsec >= 100.0) { - sprintf(timeStr, "%7.1f", timeUsec); - } else { - sprintf(timeStr, "%7.2f", timeUsec); - } - if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); - } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); - } - - args->bw[0] += busBw; - args->bw_count[0]++; - return testSuccess; -} - -void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { - int nranks = args->nProcs*args->nGpus*args->nThreads; - size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; - - count = size / wordSize(type); - args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); - - args->nbytes = paramCount * wordSize(type); - args->sendBytes = sendCount * wordSize(type); - args->expectedBytes = recvCount * wordSize(type); - args->sendInplaceOffset = sendInplaceOffset * wordSize(type); - args->recvInplaceOffset = recvInplaceOffset * wordSize(type); -} - -testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { - // Warm-up for large size - setupArgs(args->maxbytes, type, args); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, type, op, root, 0, iter)); - } - TESTCHECK(completeColl(args)); - - // Warm-up for small size - setupArgs(args->minbytes, type, args); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, type, op, root, 0, iter)); - } - TESTCHECK(completeColl(args)); - - // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - TESTCHECK(BenchTime(args, type, op, root, 1)); - PRINT("\n"); - } - return testSuccess; -} - -testResult_t threadRunTests(struct threadArgs* args) { - // Set device to the first of our GPUs. 
If we don't do that, some operations - // will be done on the current GPU (by default : 0) and if the GPUs are in - // exclusive mode those operations will fail. - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; - CUDACHECK(cudaSetDevice(gpuid)); - TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); - return testSuccess; -} - -testResult_t threadInit(struct threadArgs* args) { - char hostname[1024]; - getHostName(hostname, 1024); - int nranks = args->nProcs*args->nThreads*args->nGpus; - - //set main thread again - is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; - - NCCLCHECK(ncclGroupStart()); - for (int i=0; inGpus; i++) { - int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); - NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); - } - NCCLCHECK(ncclGroupEnd()); - - TESTCHECK(threadRunTests(args)); - - for (int i=0; inGpus; i++) { - NCCLCHECK(ncclCommDestroy(args->comms[i])); - } - return testSuccess; -} - -void* threadLauncher(void* thread_) { - struct testThread* thread = (struct testThread*)thread_; - thread->ret = thread->func(&thread->args); - return NULL; -} -testResult_t threadLaunch(struct testThread* thread) { - pthread_create(&thread->thread, NULL, threadLauncher, thread); - return testSuccess; -} - -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { - CUDACHECK(cudaMalloc(sendbuff, nbytes)); - CUDACHECK(cudaMalloc(recvbuff, nbytes)); - if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); - return testSuccess; -} - -testResult_t run(); // Main function - -int main(int argc, char* argv[]) { - // Make sure everyline is flushed so that we see the progress of the test - setlinebuf(stdout); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - ncclGetVersion(&test_ncclVersion); - #else - test_ncclVersion = NCCL_VERSION_CODE; - #endif - //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) - test_opnum = 4; - test_typenum = 9; - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { - test_opnum++; // ncclAvg - #if defined(__CUDA_BF16_TYPES_EXIST__) - test_typenum++; // bfloat16 - #endif - } - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { - test_opnum++; // PreMulSum - } - #endif - - // Parse args - double parsed; - int longindex; - static struct option longopts[] = { - {"nthreads", required_argument, 0, 't'}, - {"ngpus", required_argument, 0, 'g'}, - {"minbytes", required_argument, 0, 'b'}, - {"maxbytes", required_argument, 0, 'e'}, - {"stepbytes", required_argument, 0, 'i'}, - {"stepfactor", required_argument, 0, 'f'}, - {"iters", required_argument, 0, 'n'}, - {"agg_iters", required_argument, 0, 'm'}, - {"warmup_iters", required_argument, 0, 'w'}, - {"parallel_init", required_argument, 0, 'p'}, - {"check", required_argument, 0, 'c'}, - {"op", required_argument, 0, 'o'}, - {"datatype", required_argument, 0, 'd'}, - {"root", required_argument, 0, 'r'}, - {"blocking", required_argument, 0, 'z'}, - {"cudagraph", required_argument, 0, 'G'}, - {"average", required_argument, 0, 'a'}, - {"help", no_argument, 0, 'h'}, 
- {} - }; - - while(1) { - int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); - - if (c == -1) - break; - - switch(c) { - case 't': - nThreads = strtol(optarg, NULL, 0); - break; - case 'g': - nGpus = strtol(optarg, NULL, 0); - break; - case 'b': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'minbytes'\n"); - return -1; - } - minBytes = (size_t)parsed; - break; - case 'e': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'maxbytes'\n"); - return -1; - } - maxBytes = (size_t)parsed; - break; - case 'i': - stepBytes = strtol(optarg, NULL, 0); - break; - case 'f': - stepFactor = strtol(optarg, NULL, 0); - break; - case 'n': - iters = (int)strtol(optarg, NULL, 0); - break; - case 'm': -#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) - agg_iters = (int)strtol(optarg, NULL, 0); -#else - fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); -#endif - break; - case 'w': - warmup_iters = (int)strtol(optarg, NULL, 0); - break; - case 'c': - datacheck = (int)strtol(optarg, NULL, 0); - break; - case 'p': - parallel_init = (int)strtol(optarg, NULL, 0); - break; - case 'o': - ncclop = ncclstringtoop(optarg); - break; - case 'd': - nccltype = ncclstringtotype(optarg); - break; - case 'r': - ncclroot = strtol(optarg, NULL, 0); - break; - case 'z': - blocking_coll = strtol(optarg, NULL, 0); - break; - case 'G': -#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 - cudaGraphLaunches = strtol(optarg, NULL, 0); -#else - printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); -#endif - break; - case 'a': - average = (int)strtol(optarg, NULL, 0); - break; - case 'h': - default: - if (c != 'h') printf("invalid option '%c'\n", c); - printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" - "[-g,--ngpus ] \n\t" - "[-b,--minbytes ] \n\t" - "[-e,--maxbytes ] \n\t" - "[-i,--stepbytes ] \n\t" - "[-f,--stepfactor ] \n\t" - "[-n,--iters ] \n\t" - "[-m,--agg_iters ] \n\t" - "[-w,--warmup_iters ] \n\t" - "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - "[-o,--op ] \n\t" -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - "[-o,--op ] \n\t" -#else - "[-o,--op ] \n\t" -#endif - "[-d,--datatype ] \n\t" - "[-r,--root ] \n\t" - "[-z,--blocking <0/1>] \n\t" - "[-G,--cudagraph ] \n\t" - "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" - "[-h,--help]\n", - basename(argv[0])); - return 0; - } - } - if (minBytes > maxBytes) { - fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", - (unsigned long long)minBytes, - (unsigned long long)maxBytes); - return -1; - } -#ifdef MPI_SUPPORT - MPI_Init(&argc, &argv); -#endif - TESTCHECK(run()); - return 0; -} - -testResult_t run() { - int nProcs = 1, proc = 0; - int localRank = 0; - char hostname[1024]; - getHostName(hostname, 1024); - -#ifdef MPI_SUPPORT - MPI_Comm_size(MPI_COMM_WORLD, &nProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &proc); - uint64_t hostHashs[nProcs]; - hostHashs[proc] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); - if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); - if 
(parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); - PRINT("#\n"); - - PRINT("# Using devices\n"); -#define MAX_LINE 2048 - char line[MAX_LINE]; - int len = 0; - size_t maxMem = ~0; - for (int i=0; i memMaxBytes) { - maxBytes = memMaxBytes; - if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); - } - - ncclUniqueId ncclId; - if (proc == 0) { - NCCLCHECK(ncclGetUniqueId(&ncclId)); - } -#ifdef MPI_SUPPORT - MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); -#endif - cudaStream_t streams[nGpus*nThreads]; - void* sendbuffs[nGpus*nThreads]; - void* recvbuffs[nGpus*nThreads]; - void* expected[nGpus*nThreads]; - size_t sendBytes, recvBytes; - - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); - - for (int i=0; i=0; t--) { - threads[t].args.minbytes=minBytes; - threads[t].args.maxbytes=maxBytes; - threads[t].args.stepbytes=stepBytes; - threads[t].args.stepfactor=stepFactor; - threads[t].args.localRank = localRank; - - threads[t].args.nProcs=nProcs; - threads[t].args.proc=proc; - threads[t].args.nThreads=nThreads; - threads[t].args.thread=t; - threads[t].args.nGpus=nGpus; - threads[t].args.sendbuffs = sendbuffs+t*nGpus; - threads[t].args.recvbuffs = recvbuffs+t*nGpus; - threads[t].args.expected = expected+t*nGpus; - threads[t].args.ncclId = ncclId; - threads[t].args.comms=comms+t*nGpus; - threads[t].args.streams=streams+t*nGpus; - - threads[t].args.barrier = (volatile int*)barrier; - threads[t].args.barrier_idx = 0; - threads[t].args.reduce = (volatile double*)reduce; - threads[t].args.sync = (volatile int*)sync; - threads[t].args.sync_idx = 0; - threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); - threads[t].args.errors=errors+t; - threads[t].args.bw=bw+t; - threads[t].args.bw_count=bw_count+t; - - threads[t].args.reportErrors = 1; - - threads[t].func = parallel_init ? 
threadInit : threadRunTests; - if (t) - TESTCHECK(threadLaunch(threads+t)); - else - TESTCHECK(threads[t].func(&threads[t].args)); - } - - // Wait for other threads and accumulate stats and errors - for (int t=nThreads-1; t>=0; t--) { - if (t) pthread_join(threads[t].thread, NULL); - TESTCHECK(threads[t].ret); - if (t) { - errors[0] += errors[t]; - bw[0] += bw[t]; - bw_count[0] += bw_count[t]; - } - } - -#ifdef MPI_SUPPORT - MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -#endif - - if (!parallel_init) { - for(int i=0; i -#include -#include -#include -#include "cuda.h" - -int test_ncclVersion = 0; // init'd with ncclGetVersion() - -#if NCCL_MAJOR >= 2 - ncclDataType_t test_types[ncclNumTypes] = { - ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclBfloat16 - #endif - }; - const char *test_typenames[ncclNumTypes] = { - "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , "bfloat16" - #endif - }; - int test_typenum = -1; - - const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - , ncclAvg - #endif - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand - #endif - }; - int test_opnum = -1; -#else - ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; - const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; - int test_typenum = 7; - const char *test_opnames[] = {"sum", "prod", "max", "min"}; - ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; - int test_opnum = 4; -#endif - -thread_local int is_main_thread = 0; - -// Command line parameter defaults -static int nThreads = 1; -static int nGpus = 1; -static size_t minBytes = 32*1024*1024; -static size_t maxBytes = 32*1024*1024; -static size_t stepBytes = 1*1024*1024; -static size_t stepFactor = 1; -static int datacheck = 1; -static int warmup_iters = 5; -static int iters = 20; -static int agg_iters = 1; -static int ncclop = ncclSum; -static int nccltype = ncclFloat; -static int ncclroot = 0; -static int parallel_init = 0; -static int blocking_coll = 0; -static int cudaGraphLaunches = 0; -// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) -static int average = 1; - -#define NUM_BLOCKS 32 - -static double parsesize(const char *value) { - long long int units; - double size; - char size_lit; - - int count = sscanf(value, "%lf %1s", &size, &size_lit); - - switch (count) { - case 2: - switch (size_lit) { - case 'G': - case 'g': - units = 1024*1024*1024; - break; - case 'M': - case 'm': - units = 1024*1024; - break; - case 'K': - case 'k': - units = 1024; - break; - default: - return -1.0; - }; - break; - case 1: - units = 1; - break; - default: - return -1.0; - } - - return size * units; -} - -double DeltaMaxValue(ncclDataType_t type) { - switch(type) { - case ncclHalf: return 1e-2; -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: return 1e-2; -#endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint8: - //case ncclInt32: - case ncclUint32: -#endif 
- case ncclInt64: - case ncclUint64: return 1e-200; - } - return 1e-200; -} - -template __device__ -double absDiff(T a, T b) { - return fabs((double)(b - a)); -} - -template<> __device__ -double absDiff(half a, half b) { - float x = __half2float(a); - float y = __half2float(b); - return fabs((double)(y-x)); -} - -template __device__ -float toFloat(T a) { - return (float)a; -} -template<> __device__ -float toFloat(half a) { - return __half2float(a); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> __device__ -float toFloat(__nv_bfloat16 a) { - return __bfloat162float(a); -} -#endif - -template __global__ -void deltaKern(void* A_, void* B_, size_t count, double* max) { - const T* A = (const T*)A_; - const T* B = (const T*)B_; - __shared__ double temp[BSIZE]; - int tid = blockIdx.x*blockDim.x + threadIdx.x; - double locmax = 0.0; - for(size_t i=tid; i locmax ) { - locmax = delta; -#ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); -#endif - } - } - - tid = threadIdx.x; - temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { - __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; - } - __syncthreads(); - if( threadIdx.x == 0) - max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; -} - -testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { - switch (type) { -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; -#endif - case ncclHalf: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclFloat: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclDouble: - deltaKern<<>>(results, expected, count, devmax); break; - - case ncclChar: -#if NCCL_MAJOR >= 2 - case ncclUint8: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint32: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt64: - case ncclUint64: - deltaKern<<>>(results, expected, count, devmax); break; - } - CUDACHECK(cudaDeviceSynchronize()); - for (int i=1; i -__device__ T testValue(const size_t offset, const int rep, const int rank) { - uint8_t v = (rep+rank+offset) % 256; - return (T)v; -} - -// For floating point datatype, we use values between 0 and 1 otherwise the -// Product operation will produce NaNs. -template<> -__device__ double testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(double)testValue(offset, rep, rank)); -} -template<> -__device__ float testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(float)testValue(offset, rep, rank)); -} -template<> -__device__ half testValue(const size_t offset, const int rep, const int rank) { - return __float2half(testValue(offset, rep, rank)); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { - return __float2bfloat16(testValue(offset, rep, rank)); -} -#endif - -// Operations -template -__device__ T ncclOpSum(T a, T b) { return a+b; } -template -__device__ T ncclOpProd(T a, T b) { return a*b; } -template -__device__ T ncclOpMax(T a, T b) { return a>b ? 
a : b; } -template -__device__ T ncclOpMin(T a, T b) { return a -__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } -template<> -__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } -template<> -__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } -template<> -__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; } - -template -__device__ T ncclPPOpIdent(T x, int arg) { return x; } -template -__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } -template -__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } -template<> -__device__ half ncclPPOpMul(half x, int arg) { - return __float2half(__half2float(x)*float(arg)); -} -template<> -__device__ half ncclPPOpDiv(half x, int n) { - return __float2half(__half2float(x)/n); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { - return __float2bfloat16(__bfloat162float(x)*float(arg)); -} -template<> -__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { - return __float2bfloat16(__bfloat162float(x)/n); -} -#endif - -__host__ __device__ int preMulScalar(int rank) { - return 1 + rank%2; -} - -template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); - val = PreOp(val, preMulScalar(0)); - for (int i=1; i(o+offset, rep, i); - val1 = PreOp(val1, preMulScalar(i)); - val = Op(val, val1); - } - data[o] = PostOp(val, nranks); - } -} - -#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ - KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) -#else - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) -#endif - -static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - OPS(__nv_bfloat16) -#endif -}; - -testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, 
cudaStreamDefault)); - return testSuccess; -} - -template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); -} - -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - (void*)InitDataKernel< float>, - (void*)InitDataKernel< double>, -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - (void*)InitDataKernel<__nv_bfloat16> -#endif -}; - -template -testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { - T* ptr = (T*)dest; - InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); - return testSuccess; -} - -testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); - return testSuccess; -} - -void Barrier(struct threadArgs* args) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - MPI_Barrier(MPI_COMM_WORLD); -#endif - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); - } - args->barrier_idx=!args->barrier_idx; -} - -// Inter-thread/process barrier+allreduce -void Allreduce(struct threadArgs* args, double* value, int average) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - double val = *value; - if (args->thread > 0) { - double val2 = args->reduce[args->barrier_idx]; - if (average == 1) val += val2; - if (average == 2) val = std::min(val, val2); - if (average == 3) val = std::max(val, val2); - } - if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - if (average != 0) { - MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; - MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); - } -#endif - if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; - args->reduce[1-args->barrier_idx] = 0; - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); - } - *value = args->reduce[args->barrier_idx]; - args->barrier_idx=!args->barrier_idx; -} - -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { - size_t count = args->expectedBytes/wordSize(type); - double maxDelta = 0.0; - for (int i=0; inGpus; i++) { - int device; - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); - void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); - maxDelta = std::max(*(args->deltaHost), maxDelta); - -#ifdef DEBUG_PRINT - if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); - - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Expected: "); - for(int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - printf("\n"); - - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Actual: "); - for (int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); - } -#endif - } - double nranks = args->nProcs*args->nThreads*args->nGpus; - if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; - *delta = maxDelta; - return testSuccess; -} - -testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { - cudaError_t cudaErr; - int remaining = ngpus; - int* done = (int*)malloc(sizeof(int)*ngpus); - memset(done, 0, sizeof(int)*ngpus); - while (remaining) { - int idle = 1; - for (int i=0; i= NCCL_VERSION(2,4,0) - if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { - ncclResult_t ncclAsyncErr; - NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); - if (ncclAsyncErr != ncclSuccess) { - // An asynchronous error happened. Stop the operation and destroy - // the communicator - for (int i=0; inbytes / wordSize(type); - - // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange - size_t totalnbytes = max(args->sendBytes, args->expectedBytes); - size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; - size_t shift = totalnbytes * (iter % steps); - - if (args->nGpus > 1) { - // printf("startColl, args->nGpus > 1 run ncclGroupStart\n"); - NCCLCHECK(ncclGroupStart()); - } - for (int i = 0; i < args->nGpus; i++) { -#ifndef NCCL_MAJOR - int cudaDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); - CUDACHECK(cudaSetDevice(cudaDev)); -#endif - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - char* recvBuff = ((char*)args->recvbuffs[i]) + shift; - char* sendBuff = ((char*)args->sendbuffs[i]) + shift; - ncclRedOp_t op; - - if(opIndex < ncclNumOps) { - op = opIndex; - } - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - else { - union { - int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; - half f16; float f32; double f64; - #if defined(__CUDA_BF16_TYPES_EXIST__) - __nv_bfloat16 bf16; - #endif - }; - int scalar = preMulScalar(rank); - switch(type) { - case ncclInt8: i8 = int8_t(scalar); break; - case ncclUint8: u8 = uint8_t(scalar); break; - case ncclInt32: i32 = int32_t(scalar); break; - case ncclUint32: u32 = uint32_t(scalar); break; - case ncclInt64: i64 = int32_t(scalar); break; - case ncclUint64: u64 = uint32_t(scalar); break; - case ncclFloat16: f16 = __float2half(float(scalar)); break; - case ncclFloat32: f32 = float(scalar); break; - case ncclFloat64: f64 = double(scalar); break; - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; - #endif - } - NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); - } - #endif - - TESTCHECK(args->collTest->runColl( - (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), - (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - if(opIndex >= ncclNumOps) { - NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); - } - #endif - } - if (args->nGpus > 1) { - // printf("startColl, args->nGpus > 1 run ncclGroupEnd\n"); - NCCLCHECK(ncclGroupEnd()); - } - - if (blocking_coll) { - // Complete op before returning - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); - } - if (blocking_coll) Barrier(args); - return testSuccess; -} - -testResult_t completeColl(struct threadArgs* args) { - if (blocking_coll) return testSuccess; - - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); - return testSuccess; -} - -testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { - size_t count = args->nbytes / wordSize(type); - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - } - - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); - - Barrier(args); - -#if CUDART_VERSION >= 11030 - cudaGraph_t graphs[args->nGpus]; - cudaGraphExec_t graphExec[args->nGpus]; - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture - for (int i=0; inGpus; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect - CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); - } - } -#endif - - // Performance Benchmark - auto start = std::chrono::high_resolution_clock::now(); - for (int iter = 0; iter < iters; iter++) { - if (agg_iters>1) NCCLCHECK(ncclGroupStart()); 
- for (int aiter = 0; aiter < agg_iters; aiter++) { - TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); - } - if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); - } - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// // End cuda graph capture -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); -// } -// // Instantiate cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); -// } -// // Resync CPU, restart timing, launch cuda graph -// Barrier(args); -// start = std::chrono::high_resolution_clock::now(); -// for (int l=0; lnGpus; i++) { -// CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); -// } -// } -// } -// #endif - - TESTCHECK(completeColl(args)); - - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); - deltaSec = deltaSec/(iters*agg_iters); - if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; - Allreduce(args, &deltaSec, average); - -// #if CUDART_VERSION >= 11030 -// if (cudaGraphLaunches >= 1) { -// //destroy cuda graph -// for (int i=0; inGpus; i++) { -// CUDACHECK(cudaGraphExecDestroy(graphExec[i])); -// CUDACHECK(cudaGraphDestroy(graphs[i])); -// } -// } -// #endif - - double algBw, busBw; - args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); - - Barrier(args); - - double maxDelta = 0; - static __thread int rep = 0; - rep++; - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); - - //test validation in single itertion, should ideally be included into the multi-iteration run - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - - TESTCHECK(completeColl(args)); - - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); - - //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); - } - - double timeUsec = deltaSec*1.0E6; - char timeStr[100]; - if (timeUsec >= 10000.0) { - sprintf(timeStr, "%7.0f", timeUsec); - } else if (timeUsec >= 100.0) { - sprintf(timeStr, "%7.1f", timeUsec); - } else { - sprintf(timeStr, "%7.2f", timeUsec); - } - if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); - } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); - } - - args->bw[0] += busBw; - args->bw_count[0]++; - return testSuccess; -} - -void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { - int nranks = args->nProcs*args->nGpus*args->nThreads; - size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; - - count = size / wordSize(type); - args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); - - args->nbytes = paramCount * wordSize(type); - args->sendBytes = sendCount * wordSize(type); - args->expectedBytes = recvCount * wordSize(type); - args->sendInplaceOffset = sendInplaceOffset * wordSize(type); - args->recvInplaceOffset = recvInplaceOffset * wordSize(type); -} - -testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { - // // Warm-up for large size - // setupArgs(args->maxbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // TESTCHECK(startColl(args, type, op, root, 0, 
iter)); - // } - // TESTCHECK(completeColl(args)); - - // // Warm-up for small size - // setupArgs(args->minbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // TESTCHECK(startColl(args, type, op, root, 0, iter)); - // } - // TESTCHECK(completeColl(args)); - - // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - // TESTCHECK(BenchTime(args, type, op, root, 1)); - PRINT("\n"); - } - return testSuccess; -} - -testResult_t threadRunTests(struct threadArgs* args) { - // Set device to the first of our GPUs. If we don't do that, some operations - // will be done on the current GPU (by default : 0) and if the GPUs are in - // exclusive mode those operations will fail. - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; - CUDACHECK(cudaSetDevice(gpuid)); - TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); - return testSuccess; -} - -testResult_t threadInit(struct threadArgs* args) { - char hostname[1024]; - getHostName(hostname, 1024); - int nranks = args->nProcs*args->nThreads*args->nGpus; - - //set main thread again - is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; - - NCCLCHECK(ncclGroupStart()); - for (int i=0; inGpus; i++) { - int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); - NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); - } - NCCLCHECK(ncclGroupEnd()); - - TESTCHECK(threadRunTests(args)); - - for (int i=0; inGpus; i++) { - NCCLCHECK(ncclCommDestroy(args->comms[i])); - } - return testSuccess; -} - -void* threadLauncher(void* thread_) { - struct testThread* thread = (struct testThread*)thread_; - thread->ret = thread->func(&thread->args); - return NULL; -} -testResult_t threadLaunch(struct testThread* thread) { - pthread_create(&thread->thread, NULL, threadLauncher, thread); - return testSuccess; -} - -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { - CUDACHECK(cudaMalloc(sendbuff, nbytes)); - CUDACHECK(cudaMalloc(recvbuff, nbytes)); - if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); - return testSuccess; -} - -testResult_t run(); // Main function - -int main(int argc, char* argv[]) { - // Make sure everyline is flushed so that we see the progress of the test - setlinebuf(stdout); - - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - ncclGetVersion(&test_ncclVersion); - #else - test_ncclVersion = NCCL_VERSION_CODE; - #endif - //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) - test_opnum = 4; - test_typenum = 9; - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { - test_opnum++; // ncclAvg - #if defined(__CUDA_BF16_TYPES_EXIST__) - test_typenum++; // bfloat16 - #endif - } - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { - test_opnum++; // PreMulSum - } - #endif - - // 
Parse args - double parsed; - int longindex; - static struct option longopts[] = { - {"nthreads", required_argument, 0, 't'}, - {"ngpus", required_argument, 0, 'g'}, - {"minbytes", required_argument, 0, 'b'}, - {"maxbytes", required_argument, 0, 'e'}, - {"stepbytes", required_argument, 0, 'i'}, - {"stepfactor", required_argument, 0, 'f'}, - {"iters", required_argument, 0, 'n'}, - {"agg_iters", required_argument, 0, 'm'}, - {"warmup_iters", required_argument, 0, 'w'}, - {"parallel_init", required_argument, 0, 'p'}, - {"check", required_argument, 0, 'c'}, - {"op", required_argument, 0, 'o'}, - {"datatype", required_argument, 0, 'd'}, - {"root", required_argument, 0, 'r'}, - {"blocking", required_argument, 0, 'z'}, - {"cudagraph", required_argument, 0, 'G'}, - {"average", required_argument, 0, 'a'}, - {"help", no_argument, 0, 'h'}, - {} - }; - - while(1) { - int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); - - if (c == -1) - break; - - switch(c) { - case 't': - nThreads = strtol(optarg, NULL, 0); - break; - case 'g': - nGpus = strtol(optarg, NULL, 0); - break; - case 'b': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'minbytes'\n"); - return -1; - } - minBytes = (size_t)parsed; - break; - case 'e': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'maxbytes'\n"); - return -1; - } - maxBytes = (size_t)parsed; - break; - case 'i': - stepBytes = strtol(optarg, NULL, 0); - break; - case 'f': - stepFactor = strtol(optarg, NULL, 0); - break; - case 'n': - iters = (int)strtol(optarg, NULL, 0); - break; - case 'm': -#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) - agg_iters = (int)strtol(optarg, NULL, 0); -#else - fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); -#endif - break; - case 'w': - warmup_iters = (int)strtol(optarg, NULL, 0); - break; - case 'c': - datacheck = (int)strtol(optarg, NULL, 0); - break; - case 'p': - parallel_init = (int)strtol(optarg, NULL, 0); - break; - case 'o': - ncclop = ncclstringtoop(optarg); - break; - case 'd': - nccltype = ncclstringtotype(optarg); - break; - case 'r': - ncclroot = strtol(optarg, NULL, 0); - break; - case 'z': - blocking_coll = strtol(optarg, NULL, 0); - break; - case 'G': -#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 - cudaGraphLaunches = strtol(optarg, NULL, 0); -#else - printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. 
Ignoring\n"); -#endif - break; - case 'a': - average = (int)strtol(optarg, NULL, 0); - break; - case 'h': - default: - if (c != 'h') printf("invalid option '%c'\n", c); - printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" - "[-g,--ngpus ] \n\t" - "[-b,--minbytes ] \n\t" - "[-e,--maxbytes ] \n\t" - "[-i,--stepbytes ] \n\t" - "[-f,--stepfactor ] \n\t" - "[-n,--iters ] \n\t" - "[-m,--agg_iters ] \n\t" - "[-w,--warmup_iters ] \n\t" - "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - "[-o,--op ] \n\t" -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - "[-o,--op ] \n\t" -#else - "[-o,--op ] \n\t" -#endif - "[-d,--datatype ] \n\t" - "[-r,--root ] \n\t" - "[-z,--blocking <0/1>] \n\t" - "[-G,--cudagraph ] \n\t" - "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" - "[-h,--help]\n", - basename(argv[0])); - return 0; - } - } - if (minBytes > maxBytes) { - fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", - (unsigned long long)minBytes, - (unsigned long long)maxBytes); - return -1; - } -#ifdef MPI_SUPPORT - MPI_Init(&argc, &argv); -#endif - TESTCHECK(run()); - return 0; -} - -testResult_t run() { - int nProcs = 1, proc = 0; - int localRank = 0; - char hostname[1024]; - getHostName(hostname, 1024); - -#ifdef MPI_SUPPORT - MPI_Comm_size(MPI_COMM_WORLD, &nProcs); - MPI_Comm_rank(MPI_COMM_WORLD, &proc); - uint64_t hostHashs[nProcs]; - hostHashs[proc] = getHostHash(hostname); - MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); - for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); - if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); - if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); - PRINT("#\n"); - - PRINT("# Using devices\n"); -#define MAX_LINE 2048 - char line[MAX_LINE]; - int len = 0; - size_t maxMem = ~0; - for (int i=0; i memMaxBytes) { - maxBytes = memMaxBytes; - if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); - } - - ncclUniqueId ncclId; - if (proc == 0) { - NCCLCHECK(ncclGetUniqueId(&ncclId)); - } -#ifdef MPI_SUPPORT - MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); -#endif - cudaStream_t streams[nGpus*nThreads]; - void* sendbuffs[nGpus*nThreads]; - void* recvbuffs[nGpus*nThreads]; - void* expected[nGpus*nThreads]; - size_t sendBytes, recvBytes; - - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); - - for (int i=0; i=0; t--) { - threads[t].args.minbytes=minBytes; - threads[t].args.maxbytes=maxBytes; - threads[t].args.stepbytes=stepBytes; - threads[t].args.stepfactor=stepFactor; - threads[t].args.localRank = localRank; - - threads[t].args.nProcs=nProcs; - threads[t].args.proc=proc; - threads[t].args.nThreads=nThreads; - threads[t].args.thread=t; - threads[t].args.nGpus=nGpus; - threads[t].args.sendbuffs = sendbuffs+t*nGpus; - threads[t].args.recvbuffs = recvbuffs+t*nGpus; - threads[t].args.expected = expected+t*nGpus; - threads[t].args.ncclId = ncclId; - threads[t].args.comms=comms+t*nGpus; - threads[t].args.streams=streams+t*nGpus; - - threads[t].args.barrier = (volatile int*)barrier; - threads[t].args.barrier_idx = 0; - threads[t].args.reduce = (volatile double*)reduce; - threads[t].args.sync = (volatile 
int*)sync; - threads[t].args.sync_idx = 0; - threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); - threads[t].args.errors=errors+t; - threads[t].args.bw=bw+t; - threads[t].args.bw_count=bw_count+t; - - threads[t].args.reportErrors = 1; - - threads[t].func = parallel_init ? threadInit : threadRunTests; - if (t) - TESTCHECK(threadLaunch(threads+t)); - else - TESTCHECK(threads[t].func(&threads[t].args)); - } - - // Wait for other threads and accumulate stats and errors - for (int t=nThreads-1; t>=0; t--) { - if (t) pthread_join(threads[t].thread, NULL); - TESTCHECK(threads[t].ret); - if (t) { - errors[0] += errors[t]; - bw[0] += bw[t]; - bw_count[0] += bw_count[t]; - } - } - -#ifdef MPI_SUPPORT - MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -#endif - - if (!parallel_init) { - for(int i=0; i Date: Fri, 9 Sep 2022 08:13:28 +0000 Subject: [PATCH 026/109] wierd check --- src/Makefile | 40 ++++++++++++------ src/common.cu | 2 +- src_simple/Makefile | 40 ++++++++++++------ src_simple/common_simple.cu | 74 ++++++++++++++++++++-------------- src_simple/ofccl_all_reduce.cu | 21 +++++++++- 5 files changed, 118 insertions(+), 59 deletions(-) diff --git a/src/Makefile b/src/Makefile index 2a399db..8cee9d8 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 0 +DEBUG ?= 1 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -19,20 +19,32 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) # Better define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. -ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) -NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_80,code=sm_80 \ - -gencode=arch=compute_80,code=compute_80 +# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) else -NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_70,code=compute_70 +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 @@ -72,6 +84,8 @@ endif LIBRARIES += nccl NVLDFLAGS += $(LIBRARIES:%=-l%) +$(info CARDNAME $(NVCUFLAGS)) + DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) diff --git a/src/common.cu b/src/common.cu index 72857cd..939e777 100644 --- a/src/common.cu +++ b/src/common.cu @@ -590,7 +590,7 @@ testResult_t completeColl(struct threadArgs* args) { testResult_t 
BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); - if (datacheck) { + if (datacheck) { // 这里的目的应该是让测带宽跑的coll也使用非0数据。 // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); } diff --git a/src_simple/Makefile b/src_simple/Makefile index de282de..ccad131 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 0 +DEBUG ?= 1 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -19,20 +19,32 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) # Better define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. -ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) -NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_80,code=sm_80 \ - -gencode=arch=compute_80,code=compute_80 +# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) else -NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_70,code=compute_70 +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 @@ -72,6 +84,8 @@ endif LIBRARIES += nccl NVLDFLAGS += $(LIBRARIES:%=-l%) +$(info CARDNAME $(NVCUFLAGS)) + DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 063664d..c01218a 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -216,7 +216,7 @@ __global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { } testResult_t CheckDelta(void *results, void *expected, size_t count, - ncclDataType_t type, double *devmax) { + ncclDataType_t type, double *devmax, cudaStream_t stream) { switch (type) { #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: @@ -281,7 +281,7 @@ __device__ float testValue(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - return 1.0; + return 0.25; } template <> __device__ half testValue(const size_t offset, const int rep, @@ -437,8 +437,7 @@ testResult_t InitData(void *data, const size_t count, ncclDataType_t type, dim3 grid = {32, 1, 1}; dim3 block = {256, 1, 1}; void *args[4] = {(void *)&data, (void *)&count, 
(void *)&rep, (void *)&rank}; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, - cudaStreamDefault)); + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); return testSuccess; } @@ -496,7 +495,7 @@ void Allreduce(struct threadArgs *args, double *value, int average) { } testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, - ncclRedOp_t op, int root, int in_place, double *delta) { + ncclRedOp_t op, int root, int in_place, double *delta, cudaStream_t stream) { // 不要在默认stream上跑。 size_t count = args->expectedBytes / wordSize(type); double maxDelta = 0.0; for (int i = 0; i < args->nGpus; i++) { @@ -508,7 +507,7 @@ testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, args->recvInplaceOffset * rank)) : args->recvbuffs[i]; TESTCHECK( - CheckDelta(data, args->expected[i], count, type, args->deltaHost)); + CheckDelta(data, args->expected[i], count, type, args->deltaHost, stream)); maxDelta = std::max(*(args->deltaHost), maxDelta); #ifdef DEBUG_PRINT @@ -516,15 +515,15 @@ testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, int *expectedHost = (int *)malloc(args->expectedBytes); int *dataHost = (int *)malloc(args->expectedBytes); - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, - cudaMemcpyDeviceToHost); + cudaMemcpyAsync(expectedHost, args->expected[0], args->expectedBytes, + cudaMemcpyDeviceToHost, stream); printf("\n Expected: "); for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { printf("%d:%d ", j, expectedHost[j]); } printf("\n"); - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + cudaMemcpyAsync(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost, stream); printf("\n Actual: "); for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { printf("%d:%d ", j, dataHost[j]); @@ -818,15 +817,16 @@ testResult_t completeColl(struct threadArgs *args) { testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - } + // if (datacheck) { + // // Initialize sendbuffs, recvbuffs and expected + // TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + // } Barrier(args); // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); + // TODO: 这里要支持多轮,好像也没有很复杂。 for (int iter = 0; iter < iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { TESTCHECK(startColl(args, type, op, root, in_place, @@ -851,33 +851,35 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t Barrier(args); double maxDelta = 0; - static __thread int rep = 0; - rep++; // IF_CHECK 如果要检查对错,把下边露出来 - int printNum = 10; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - float *ptr = (float *)malloc(printNum * sizeof(float)); - cudaMemcpy(ptr, args->recvbuffs[0], printNum * sizeof(float), cudaMemcpyDeviceToHost); - for (int i = 0; i < printNum; i++) { - OFTEST_LOG(TEST, "<%lu> rank=%d, recvbuff[%d]=%f", pthread_self(), cudaDev, i, ptr[i]); - } - free(ptr); + // int printNum = 10; + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // float *ptr = (float *)malloc(printNum * sizeof(float)); + // cudaMemcpy(ptr, args->recvbuffs[0], printNum * sizeof(float), cudaMemcpyDeviceToHost); + // for (int i = 0; i < printNum; i++) { + // OFTEST_LOG(TEST, 
"<%lu> rank=%d, recvbuff[%d]=%f", pthread_self(), cudaDev, i, ptr[i]); + // } + // free(ptr); if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); //test validation in single itertion, should ideally be included into the multi-iteration run - TESTCHECK(startColl(args, type, op, root, in_place, 0, 0)); + // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0)); // will set cbArgList[0].gotCqe = 0 + + // // // TESTCHECK(completeColl(args)); + // pthread_mutex_lock(&cbArgList[0].mutex); + // while (cbArgList[0].gotCqe == 0) { - TESTCHECK(completeColl(args)); + // } + // pthread_mutex_unlock(&cbArgList[0].mutex); + - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + // TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta, args->streams[0])); - //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); + // //aggregate delta from all threads and procs + // Allreduce(args, &maxDelta, 3); } double timeUsec = deltaSec * 1.0E6; @@ -932,6 +934,16 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, } } + // 在这里完成check数据的准备; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + } + ofcclPrepareDone(); } diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 2b336d4..b4af9bc 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -36,16 +36,34 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + + // OFTEST_LOG(TEST, "<%lu> rank=%d, AllReduceInitData get gpuid=%d", pthread_self(), cudaDev, gpuid); + CUDACHECK(cudaSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + // // OFTEST_LOG(TEST, "<%lu> rank=%d, AllReduceInitData get int rank=%d", pthread_self(), cudaDev, rank); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + // // OFTEST_LOG(TEST, "<%lu> rank=%d, done cudaMemset", pthread_self(), cudaDev); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, type, rep, rank)); + // OFTEST_LOG(TEST, "<%lu> rank=%d, done InitData", pthread_self(), cudaDev); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + // // OFTEST_LOG(TEST, "<%lu> rank=%d, done InitDataReduce", pthread_self(), cudaDev); + CUDACHECK(cudaDeviceSynchronize()); + + // OFTEST_LOG(TEST, "<%lu> rank=%d, done cudaDeviceSynchronize", pthread_self(), cudaDev); + } + OFTEST_LOG(TEST, "<%lu> rank=%d, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -70,7 +88,7 @@ int myCallback(int collIdFromCqe, void *args) { pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); return 0; } @@ -85,6 +103,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args)); // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; } From 732f8bdf4553aac21711e5d8307d3cad57e49b7e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 9 Sep 2022 09:05:40 +0000 Subject: [PATCH 027/109] activate -n, can run multi-iters --- src_simple/common_simple.cu | 18 +++++++++++------- src_simple/ofccl_all_reduce.cu | 20 +++----------------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index c01218a..bd3cb5e 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -792,8 +792,6 @@ testResult_t completeColl(struct threadArgs *args) { if (blocking_coll) return testSuccess; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); int gotCqeCnt = 0; while (gotCqeCnt < multi_iters) { @@ -803,14 +801,18 @@ testResult_t completeColl(struct threadArgs *args) { if (seenCqe[i] == 0) { gotCqeCnt++; seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // if (cudaDev == 0) { + // OFTEST_LOG(TEST, "<%lu> rank=%d, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // } + } } pthread_mutex_unlock(&cbArgList[i].mutex); } - // OFTEST_LOG(TEST, "<%lu> rank=%d, completeColl gotCqeCnt = %d", pthread_self(), cudaDev, gotCqeCnt); } - - // TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); return testSuccess; } @@ -828,13 +830,15 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t auto start = std::chrono::high_resolution_clock::now(); // TODO: 这里要支持多轮,好像也没有很复杂。 for (int iter = 0; iter < iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter)); } - } - TESTCHECK(completeColl(args)); + TESTCHECK(completeColl(args)); + } auto delta = std::chrono::high_resolution_clock::now() - start; double deltaSec = diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index b4af9bc..b7169f9 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -41,29 
+41,15 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - - // OFTEST_LOG(TEST, "<%lu> rank=%d, AllReduceInitData get gpuid=%d", pthread_self(), cudaDev, gpuid); - CUDACHECK(cudaSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - // // OFTEST_LOG(TEST, "<%lu> rank=%d, AllReduceInitData get int rank=%d", pthread_self(), cudaDev, rank); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); - // // OFTEST_LOG(TEST, "<%lu> rank=%d, done cudaMemset", pthread_self(), cudaDev); - void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, type, rep, rank)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, done InitData", pthread_self(), cudaDev); - TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); - // // OFTEST_LOG(TEST, "<%lu> rank=%d, done InitDataReduce", pthread_self(), cudaDev); - CUDACHECK(cudaDeviceSynchronize()); - - // OFTEST_LOG(TEST, "<%lu> rank=%d, done cudaDeviceSynchronize", pthread_self(), cudaDev); - } - OFTEST_LOG(TEST, "<%lu> rank=%d, done AllReduceInitData", pthread_self(), cudaDev); + // OFTEST_LOG(TEST, "<%lu> rank=%d, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -88,7 +74,7 @@ int myCallback(int collIdFromCqe, void *args) { pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); return 0; } @@ -103,7 +89,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args)); // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); - OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; } From 85d5cbd40afa8fe354c57f791c206f9b8583e751 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 26 Sep 2022 02:48:47 +0000 Subject: [PATCH 028/109] + warmup --- src_simple/common_simple.cu | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index bd3cb5e..275f68a 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -828,7 +828,6 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); - // TODO: 这里要支持多轮,好像也没有很复杂。 for (int iter = 0; iter < iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { @@ -952,26 +951,16 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, } // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 - // TODO: 同时如果要warmup的话,也要准备相应的callbackArgs。比较麻烦;可以考虑对比实验的时候,nccl和ofccl都不开warmup。 - // Warm-up for large size - // setupArgs(args->maxbytes, type, args); - // for (int iter = 0; iter < 
warmup_iters; iter++) { - // for (int miter = 0; miter < multi_iters; miter++) { - // TESTCHECK(startColl(args, type, op, root, 0, - // iter * multi_iters + miter, miter)); - // } - // } - // TESTCHECK(completeColl(args)); - - // // Warm-up for small size - // setupArgs(args->minbytes, type, args); - // for (int iter = 0; iter < warmup_iters; iter++) { - // for (int miter = 0; miter < multi_iters; miter++) { - // TESTCHECK(startColl(args, type, op, root, 0, - // iter * multi_iters + miter, miter)); - // } - // } - // TESTCHECK(completeColl(args)); + // warmup还是需要开,不然ofccl性能拉胯。 + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter)); + } + } + TESTCHECK(completeColl(args)); + // Benchmark for (size_t size = args->minbytes; size <= args->maxbytes; @@ -1415,6 +1404,11 @@ testResult_t run() { for (int t = nThreads - 1; t >= 0; t--) { threads[t].args.minbytes = minBytes; threads[t].args.maxbytes = maxBytes; + // TODO: 不支持多个size。 + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } threads[t].args.stepbytes = stepBytes; threads[t].args.stepfactor = stepFactor; threads[t].args.localRank = localRank; From 090185c0feaa996d2d83a74e4b36f3adb7170b8d Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 26 Sep 2022 07:55:32 +0000 Subject: [PATCH 029/109] fix completeColl in warmup --- src_simple/common_simple.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 275f68a..1f3e97b 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -954,13 +954,12 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // warmup还是需要开,不然ofccl性能拉胯。 setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { - for (int miter = 0; miter < multi_iters; miter++) { - TESTCHECK(startColl(args, type, op, root, 0, - iter * multi_iters + miter, miter)); - } + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter)); + } + TESTCHECK(completeColl(args)); } - TESTCHECK(completeColl(args)); - // Benchmark for (size_t size = args->minbytes; size <= args->maxbytes; From 17197fab885c1974f6cb9758ac6cf2955ea6db05 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 30 Sep 2022 07:16:45 +0000 Subject: [PATCH 030/109] try context --- src_simple/common_simple.cu | 76 ++++++++++++++++++---------------- src_simple/common_simple.h | 4 +- src_simple/ofccl_all_reduce.cu | 8 ++-- 3 files changed, 47 insertions(+), 41 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 1f3e97b..d748ab8 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -589,7 +589,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, } testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, - ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { size_t count = args->nbytes / wordSize(type); if (args->nGpus != 1) { OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); @@ -664,7 +664,7 @@ testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, &op, &u64, type, 
ncclScalarHostImmediate, comm)); } #endif - TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter)); + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { @@ -677,7 +677,7 @@ testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, } testResult_t startColl(struct threadArgs *args, ncclDataType_t type, - ncclRedOp_t opIndex, int root, int in_place, int iter, int miter) { + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { size_t count = args->nbytes / wordSize(type); // Try to change offset for each iteration so that we avoid cache effects and @@ -766,7 +766,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff), (void *)(in_place ? recvBuff + args->recvInplaceOffset * rank - : recvBuff), miter, cbArgList + miter)); + : recvBuff), miter, cbArgList + miter, rankCtx)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { @@ -816,7 +816,7 @@ testResult_t completeColl(struct threadArgs *args) { return testSuccess; } -testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { size_t count = args->nbytes / wordSize(type); // if (datacheck) { @@ -833,7 +833,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t for (int miter = 0; miter < multi_iters; miter++) { seenCqe[miter] = 0; TESTCHECK(startColl(args, type, op, root, in_place, - iter * multi_iters + miter, miter)); + iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); @@ -869,7 +869,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t if (datacheck) { //test validation in single itertion, should ideally be included into the multi-iteration run - // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0)); // will set cbArgList[0].gotCqe = 0 + // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0, rankCtx)); // will set cbArgList[0].gotCqe = 0 // // // TESTCHECK(completeColl(args)); // pthread_mutex_lock(&cbArgList[0].mutex); @@ -925,30 +925,36 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, const char *typeName, ncclRedOp_t op, const char *opName, int root, bool is_ofccl) { - if (is_ofccl) { - // prepare for all size. op, type traversed in the caller. - // TODO: if we support multi size, each size should use a separate ncclComm - for (size_t size = args->minbytes; size <= args->maxbytes; - size = ((args->stepfactor > 1) ? size * args->stepfactor - : size + args->stepbytes)) { - setupArgs(size, type, args); - for (int miter = 0; miter < multi_iters; miter++) { - TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter)); - } + // if (is_ofccl) { + // 首先创建ofcclRankCtx_t + int thrdCudaDev; + CUDACHECK(cudaGetDevice(&thrdCudaDev)); + ofcclRankCtx_t rankCtx; + ofcclInitRankCtx(&rankCtx, thrdCudaDev); + + // prepare for all size. op, type traversed in the caller. + // TODO: if we support multi size, each size should use a separate ncclComm + for (size_t size = args->minbytes; size <= args->maxbytes; + size = ((args->stepfactor > 1) ? 
size * args->stepfactor + : size + args->stepbytes)) { + setupArgs(size, type, args); + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx)); } + } - // 在这里完成check数据的准备; - static __thread int rep = 0; - rep++; - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - } - - ofcclPrepareDone(); + // 在这里完成check数据的准备; + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); } + + ofcclPrepareDone(rankCtx); + // } // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 // warmup还是需要开,不然ofccl性能拉胯。 @@ -956,7 +962,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, for (int iter = 0; iter < warmup_iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { TESTCHECK(startColl(args, type, op, root, 0, - iter * multi_iters + miter, miter)); + iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); } @@ -968,15 +974,15 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - // TESTCHECK(BenchTime(args, type, op, root, 1)); + TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); PRINT("\n"); } - if (is_ofccl) { - // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); - ofcclDestroy(); - } + // if (is_ofccl) { + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); + ofcclDestroy(rankCtx); + // } return testSuccess; } diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index bf2d0fd..dc75f47 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -81,8 +81,8 @@ struct testColl { testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); - testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args); - testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index b7169f9..fa69a13 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -78,7 +78,7 @@ int myCallback(int collIdFromCqe, void *args) { return 0; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); @@ -87,16 +87,16 @@ testResult_t 
AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa args->gotCqe = 0; pthread_mutex_init(&args->mutex, NULL); - NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args)); + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; } -testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId) { +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { - NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId)); + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } From 5f399fd4f4ef94d23ee038b08a41ce8965815a62 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 2 Oct 2022 14:22:20 +0000 Subject: [PATCH 031/109] bugfix: seenCqe[miter] = 0; in warmup --- src_simple/common_simple.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index d748ab8..dea54ce 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -961,6 +961,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; TESTCHECK(startColl(args, type, op, root, 0, iter * multi_iters + miter, miter, rankCtx)); } From 5cd2cb8a0d702c3f1482691208c5fd3f03658f47 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 6 Oct 2022 15:03:41 +0000 Subject: [PATCH 032/109] polish callback --- src_simple/ofccl_all_reduce.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index fa69a13..dec9d32 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -62,15 +62,15 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl } int myCallback(int collIdFromCqe, void *args) { - // TODO: 不打log把这里删了,不然影响性能。 - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - int collId = ((CallBackArgs *)args)->collId; - if (collId != collIdFromCqe) { - // TODO: more robust error handle. - OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); - return -1; - } + // 不打log把这里删了,不然影响性能。 + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); From 014862887f9dbdf800d22ba7b4786443f8c6137d Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 9 Oct 2022 07:47:04 +0000 Subject: [PATCH 033/109] check OK --- src_simple/common_simple.cu | 182 +++++++++++++++--------------------- 1 file changed, 73 insertions(+), 109 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index dea54ce..d193880 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -215,50 +215,35 @@ __global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; } -testResult_t CheckDelta(void *results, void *expected, size_t count, - ncclDataType_t type, double *devmax, cudaStream_t stream) { +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { switch (type) { #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - deltaKern<__nv_bfloat16, 512> - <<>>(results, expected, count, devmax); - break; + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; #endif - case ncclHalf: - deltaKern<<>>(results, expected, count, devmax); - break; - case ncclFloat: - deltaKern - <<>>(results, expected, count, devmax); - break; - case ncclDouble: - deltaKern - <<>>(results, expected, count, devmax); - break; - - case ncclChar: + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: #if NCCL_MAJOR >= 2 - case ncclUint8: + case ncclUint8: #endif - deltaKern - <<>>(results, expected, count, devmax); - break; - case ncclInt: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: #if NCCL_MAJOR >= 2 - case ncclUint32: + case ncclUint32: #endif - deltaKern - <<>>(results, expected, count, devmax); - break; - case ncclInt64: - case ncclUint64: - deltaKern - <<>>(results, expected, count, devmax); - break; + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; } CUDACHECK(cudaDeviceSynchronize()); - for (int i = 1; i < NUM_BLOCKS; i++) - devmax[0] = std::max(devmax[0], devmax[i]); + for (int i=1; i(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - return 0.25; + return 1.0 / 3.0; } template <> __device__ half testValue(const size_t offset, const int rep, @@ -494,53 +479,48 @@ void Allreduce(struct threadArgs *args, double *value, int average) { args->barrier_idx = !args->barrier_idx; } -testResult_t CheckData(struct threadArgs *args, ncclDataType_t type, - ncclRedOp_t op, int root, int in_place, double *delta, cudaStream_t stream) { // 不要在默认stream上跑。 - size_t count = args->expectedBytes / wordSize(type); +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); double maxDelta = 0.0; - for (int i = 0; i < args->nGpus; i++) { + for (int i=0; inGpus; 
i++) { int device; - int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); CUDACHECK(cudaSetDevice(device)); - void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + - args->recvInplaceOffset * rank)) - : args->recvbuffs[i]; - TESTCHECK( - CheckDelta(data, args->expected[i], count, type, args->deltaHost, stream)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); maxDelta = std::max(*(args->deltaHost), maxDelta); #ifdef DEBUG_PRINT if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); - - cudaMemcpyAsync(expectedHost, args->expected[0], args->expectedBytes, - cudaMemcpyDeviceToHost, stream); - printf("\n Expected: "); - for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - printf("\n"); - - cudaMemcpyAsync(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost, stream); - printf("\n Actual: "); - for (int j = 0; j < args->expectedBytes / sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); } #endif } - double nranks = args->nProcs * args->nThreads * args->nGpus; - if (args->reportErrors && maxDelta > DeltaMaxValue(type) * (nranks - 1)) - args->errors[0]++; + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; *delta = maxDelta; return testSuccess; } + testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, ncclComm_t *comms) { cudaError_t cudaErr; @@ -819,10 +799,6 @@ testResult_t completeColl(struct threadArgs *args) { testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { size_t count = args->nbytes / wordSize(type); - // if (datacheck) { - // // Initialize sendbuffs, recvbuffs and expected - // TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - // } Barrier(args); @@ -854,35 +830,21 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t Barrier(args); double maxDelta = 0; - - // IF_CHECK 如果要检查对错,把下边露出来 - // int printNum = 10; - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); - // float *ptr = (float *)malloc(printNum * sizeof(float)); - // cudaMemcpy(ptr, args->recvbuffs[0], printNum * sizeof(float), cudaMemcpyDeviceToHost); - // for (int i = 0; i < printNum; i++) { - // OFTEST_LOG(TEST, "<%lu> rank=%d, recvbuff[%d]=%f", pthread_self(), cudaDev, i, ptr[i]); - // } - // free(ptr); - + // static __thread int rep = 0; // 为了再次初始化buffer的参数,没用了。 + // rep++; if 
(datacheck) { - //test validation in single itertion, should ideally be included into the multi-iteration run - // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0, rankCtx)); // will set cbArgList[0].gotCqe = 0 - - // // // TESTCHECK(completeColl(args)); - // pthread_mutex_lock(&cbArgList[0].mutex); - // while (cbArgList[0].gotCqe == 0) { - - // } - // pthread_mutex_unlock(&cbArgList[0].mutex); - + // seenCqe[0] = 0; + // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0, rankCtx)); + // TESTCHECK(completeColl(args)); - // TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta, args->streams[0])); + ofcclDestroy(rankCtx); + // TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); // //aggregate delta from all threads and procs // Allreduce(args, &maxDelta, 3); + } else { + ofcclDestroy(rankCtx); } double timeUsec = deltaSec * 1.0E6; @@ -946,11 +908,13 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // 在这里完成check数据的准备; static __thread int rep = 0; rep++; - if (datacheck) { + if (datacheck) { // 让init数据的kernel在启动daemonKernel之前执行。 // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> rank=%d, initData OK", pthread_self(), cudaDev); } ofcclPrepareDone(rankCtx); @@ -969,20 +933,20 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, } // Benchmark - for (size_t size = args->minbytes; size <= args->maxbytes; - size = ((args->stepfactor > 1) ? size * args->stepfactor - : size + args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), - args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); - // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); - PRINT("\n"); - } + // for (size_t size = args->minbytes; size <= args->maxbytes; + // size = ((args->stepfactor > 1) ? 
size * args->stepfactor + // : size + args->stepbytes)) { + // setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); + PRINT("\n"); + // } // if (is_ofccl) { // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); - ofcclDestroy(rankCtx); + // ofcclDestroy(rankCtx); // 为了做check,把这个挪到BenchTime里边。 // } return testSuccess; From b6027be0584a4f7d64bef6aeae7664a4fd457484 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 9 Oct 2022 11:59:01 +0000 Subject: [PATCH 034/109] check ok --- src_simple/common_simple.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index d193880..622d94e 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -265,8 +265,8 @@ template <> __device__ float testValue(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 - // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - return 1.0 / 3.0; + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 2.0; } template <> __device__ half testValue(const size_t offset, const int rep, From 24290c64e1a757c48d1e18b77096f23cbf9b6edf Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 9 Oct 2022 12:21:45 +0000 Subject: [PATCH 035/109] restore semi-original NCCL's BenchTime --- src/common.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common.cu b/src/common.cu index 939e777..110d55a 100644 --- a/src/common.cu +++ b/src/common.cu @@ -597,8 +597,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Sync // TODO: 之后恢复? 
- // TESTCHECK(startColl(args, type, op, root, in_place, 0)); - // TESTCHECK(completeColl(args)); + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); Barrier(args); From d60903922c9a59e7aebf09e047e24d5e0938a315 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 12 Oct 2022 15:12:01 +0000 Subject: [PATCH 036/109] run check smoothly --- src_simple/common_simple.cu | 20 +++++++------------- src_simple/common_simple.h | 2 ++ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 622d94e..2f01418 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -265,8 +265,8 @@ template <> __device__ float testValue(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 - return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - // return 1.0 / 2.0; + // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + return 1.0 / 1.0; } template <> __device__ half testValue(const size_t offset, const int rep, @@ -829,22 +829,16 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t Barrier(args); + ofcclDestroy(rankCtx); + double maxDelta = 0; // static __thread int rep = 0; // 为了再次初始化buffer的参数,没用了。 // rep++; if (datacheck) { - //test validation in single itertion, should ideally be included into the multi-iteration run - // seenCqe[0] = 0; - // TESTCHECK(startColl(args, type, op, root, in_place, 0, 0, rankCtx)); - // TESTCHECK(completeColl(args)); - - ofcclDestroy(rankCtx); - // TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); - // //aggregate delta from all threads and procs - // Allreduce(args, &maxDelta, 3); - } else { - ofcclDestroy(rankCtx); + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); } double timeUsec = deltaSec * 1.0E6; diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index dc75f47..406f634 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -16,6 +16,8 @@ #include #include "nccl1_compat.h" +// #define DEBUG_PRINT 1 + #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) #define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) From 4cd20919b8cfdc85dba915578e90cf3d0d94ad9c Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 13 Oct 2022 03:31:18 +0000 Subject: [PATCH 037/109] finalize check --- src/common.cu | 45 ------------------------------------- src_simple/common_simple.cu | 4 ++-- 2 files changed, 2 insertions(+), 47 deletions(-) diff --git a/src/common.cu b/src/common.cu index 110d55a..9c2588a 100644 --- a/src/common.cu +++ b/src/common.cu @@ -596,7 +596,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } // Sync - // TODO: 之后恢复? 
TESTCHECK(startColl(args, type, op, root, in_place, 0)); TESTCHECK(completeColl(args)); @@ -674,50 +673,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t static __thread int rep = 0; rep++; if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture for data check - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); - } - } -#endif - - //test validation in single itertion, should ideally be included into the multi-iteration run - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // End cuda graph capture - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); - } - // Instantiate cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); - } - // Launch cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); - } - } -#endif - - TESTCHECK(completeColl(args)); - -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - //destroy cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphExecDestroy(graphExec[i])); - CUDACHECK(cudaGraphDestroy(graphs[i])); - } - } -#endif - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); //aggregate delta from all threads and procs diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 2f01418..8168869 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -265,8 +265,8 @@ template <> __device__ float testValue(const size_t offset, const int rep, const int rank) { // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 - // return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); - return 1.0 / 1.0; + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 1.0; } template <> __device__ half testValue(const size_t offset, const int rep, From b8dc018fe3f14b2a217bf2ff453f8e2dfcccc62c Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 13 Oct 2022 15:26:11 +0000 Subject: [PATCH 038/109] adapt to volunteer quit --- src_simple/common_simple.cu | 12 ++++++++---- src_simple/ofccl_all_reduce.cu | 17 +++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 622d94e..52f3174 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -785,7 +785,7 @@ testResult_t completeColl(struct threadArgs *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // if (cudaDev == 0) { - // OFTEST_LOG(TEST, "<%lu> rank=%d, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); // } } @@ -914,10 +914,14 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, initData OK", pthread_self(), cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), cudaDev); } - ofcclPrepareDone(rankCtx); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclPrepareDone from TimeTest", 
pthread_self(), cudaDev); + ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 + // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // } // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 @@ -1239,7 +1243,7 @@ testResult_t run() { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, multi_iters = %d", pthread_self(), cudaDev, multi_iters); + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); #define MAX_LINE 2048 char line[MAX_LINE]; int len = 0; diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index dec9d32..049b69c 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -49,7 +49,7 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); } - // OFTEST_LOG(TEST, "<%lu> rank=%d, done AllReduceInitData", pthread_self(), cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -63,18 +63,19 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl int myCallback(int collIdFromCqe, void *args) { // 不打log把这里删了,不然影响性能。 - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); - // int collId = ((CallBackArgs *)args)->collId; // if (collId != collIdFromCqe) { // // more robust error handle. - // OFTEST_LOG(TEST_ERROR, "<%lu> rank=%d, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); // return -1; // } pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + int collId = ((CallBackArgs *)args)->collId; + OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); return 0; } @@ -88,8 +89,8 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); - // OFTEST_LOG(TEST, "<%lu> rank=%d, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; } From bd105235de11cb9c3fe57201d297edf1dbab2b00 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 13 Oct 2022 19:47:22 +0000 Subject: [PATCH 039/109] adapt to volunteer quit --- src_simple/common_simple.cu | 2 +- src_simple/ofccl_all_reduce.cu | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 9ed7393..55fc804 100644 --- a/src_simple/common_simple.cu +++ 
b/src_simple/common_simple.cu @@ -913,7 +913,6 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclPrepareDone from TimeTest", pthread_self(), cudaDev); ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // } @@ -928,6 +927,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), cudaDev, iter, multi_iters); } // Benchmark diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 049b69c..0c5593b 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -72,10 +72,10 @@ int myCallback(int collIdFromCqe, void *args) { ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - int collId = ((CallBackArgs *)args)->collId; - OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); return 0; } From 74d4f0def1d74bf94a67b80b231a56165a4d33af Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 14 Oct 2022 11:45:05 +0000 Subject: [PATCH 040/109] keep the report log --- src_simple/common_simple.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 55fc804..e1149ed 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -813,6 +813,10 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t } TESTCHECK(completeColl(args)); + + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; From ed7f645ddb224dce418045ea3d6a9f4960ebc7da Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 17 Oct 2022 06:58:41 +0000 Subject: [PATCH 041/109] try pure inplace --- src_inplace/Makefile | 109 ++ src_inplace/common_inplace.cu | 1477 +++++++++++++++++ src_inplace/common_inplace.h | 289 ++++ src_inplace/nccl1_compat.h | 50 + .../ofccl_all_reduce_inp.cu | 61 +- src_simple/all_reduce_group.cu | 143 -- src_simple/common_simple.cu | 4 +- 7 files changed, 1980 insertions(+), 153 deletions(-) create mode 100644 src_inplace/Makefile create mode 100644 src_inplace/common_inplace.cu create mode 100644 src_inplace/common_inplace.h create mode 100644 src_inplace/nccl1_compat.h rename src_simple/all_reduce_simple.cu => src_inplace/ofccl_all_reduce_inp.cu (63%) delete mode 100644 src_simple/all_reduce_group.cu diff --git a/src_inplace/Makefile b/src_inplace/Makefile new file mode 100644 index 0000000..8b0e124 --- /dev/null +++ b/src_inplace/Makefile @@ -0,0 +1,109 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 1 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. +# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := ofccl_all_reduce_inp +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_inplace.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_inplace.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_inplace/common_inplace.cu b/src_inplace/common_inplace.cu new file mode 100644 index 0000000..023030b --- /dev/null +++ b/src_inplace/common_inplace.cu @@ -0,0 +1,1477 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_inplace.h" +#include "cuda.h" +#include "nccl.h" +#include +#include +#include +#include +#include + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; +#else +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static size_t stepBytes = 1 * 1024 * 1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int multi_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; + break; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch (type) { + case ncclHalf: + return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + return 1e-2; +#endif + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + // case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: + return 1e-200; + } + return 1e-200; +} + +template __device__ double absDiff(T a, T b) { + return fabs((double)(b - 
a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue(offset, rep, rank)); +} +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template __device__ T ncclOpSum(T a, T b) { return a + b; } +template __device__ T ncclOpProd(T a, T b) { return a * b; } +template __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template __device__ T ncclOpMin(T a, T b) { return a < b ? 
a : b; } + +// Definitions for half +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? a : b; +} + +template __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); +} +template __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); +} +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } + +template +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue(o + offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i = 1; i < nranks; i++) { + T val1 = testValue(o + offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel, preop, postop> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count, 
(void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) + data[o] = testValue(o, rep, rank); +} + +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + args->barrier_idx = !args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); + } + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx = !args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); + while (remaining) { + int idle = 1; + for (int i = 0; i < ngpus; i++) { + if (done[i]) + continue; + + cudaErr = cudaStreamQuery(streams[i]); + if (cudaErr == cudaSuccess) { + done[i] = 1; + remaining--; + idle = 0; + continue; + } + + if (cudaErr != cudaErrorNotReady) + CUDACHECK(cudaErr); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } +#endif + } + + // We might want to let other threads (including NCCL threads) use the CPU. + if (idle) + pthread_yield(); + } + free(done); + return testSuccess; +} + +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + char *sendBuff = ((char *)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + // miter就是collId。 + TESTCHECK(args->collTest->runColl( + (void *)(in_place ? recvBuff + args->sendInplaceOffset * rank + : sendBuff), + (void *)(in_place ? 
recvBuff + args->recvInplaceOffset * rank + : recvBuff), miter, cbArgList + miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) + Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs *args) { + if (blocking_coll) + return testSuccess; + + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + pthread_mutex_lock(&cbArgList[i].mutex); + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // if (cudaDev == 0) { + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // } + + } + } + pthread_mutex_unlock(&cbArgList[i].mutex); + } + } + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + + size_t count = args->nbytes / wordSize(type); + + Barrier(args); + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter, rankCtx)); + } + + TESTCHECK(completeColl(args)); + + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + } + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec / (iters * agg_iters *multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, + args->nProcs * args->nThreads * args->nGpus); + + Barrier(args); + + ofcclDestroy(rankCtx); + + double maxDelta = 0; + // static __thread int rep = 0; // 为了再次初始化buffer的参数,没用了。 + // rep++; + if (datacheck) { + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec * 1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { + int nranks = args->nProcs * args->nGpus * args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, + recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, + &sendInplaceOffset, &recvInplaceOffset, 
+ (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, + const char *typeName, ncclRedOp_t op, const char *opName, + int root, bool is_ofccl) { + // if (is_ofccl) { + // 首先创建ofcclRankCtx_t + int thrdCudaDev; + CUDACHECK(cudaGetDevice(&thrdCudaDev)); + ofcclRankCtx_t rankCtx; + ofcclInitRankCtx(&rankCtx, thrdCudaDev); + + // prepare for all size. op, type traversed in the caller. + // TODO: if we support multi size, each size should use a separate ncclComm + for (size_t size = args->minbytes; size <= args->maxbytes; + size = ((args->stepfactor > 1) ? size * args->stepfactor + : size + args->stepbytes)) { + setupArgs(size, type, args); + for (int miter = 0; miter < multi_iters; miter++) { + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx)); + } + } + + // 在这里完成check数据的准备; + static __thread int rep = 0; + rep++; + if (datacheck) { // 让init数据的kernel在启动daemonKernel之前执行。 + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), cudaDev); + } + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 + // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); + // } + + // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 + // warmup还是需要开,不然ofccl性能拉胯。 + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), cudaDev, iter, multi_iters); + } + + // Benchmark + // for (size_t size = args->minbytes; size <= args->maxbytes; + // size = ((args->stepfactor > 1) ? size * args->stepfactor + // : size + args->stepbytes)) { + // setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + // TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // 由于我们把ofcclDestroy挪到BenchTime里边,所以没办法在这里通过调用两次BenchTime来先做out-of-place,再做in-place。像这样的话,可以在BenchTime里加个循环。 + PRINT("\n"); + // } + + // if (is_ofccl) { + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); + // ofcclDestroy(rankCtx); // 为了做check,把这个挪到BenchTime里边。 + // } + + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadRunTests"); + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. 
+ int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, + test_typenames[nccltype], + (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadInit"); + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs * args->nThreads * args->nGpus; + + // set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i = 0; i < args->nGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread *thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + // CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char *argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", 
required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); + + if (c == -1) + break; + + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'M': + multi_iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \ + CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA " + "11.3. 
Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-M,--multi_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), + MPI_BYTE, MPI_COMM_WORLD); + for (int p = 0; p < nProcs; p++) { + if (p == proc) + break; + if (hostHashs[p] == hostHashs[proc]) + localRank++; + } +#endif + is_main_thread = (proc == 0) ? 1 : 0; + + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? "factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? 
(char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + // size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + // if (maxBytes > memMaxBytes) { + // maxBytes = memMaxBytes; + // if (proc == 0) + // printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + // maxBytes); + // } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads]; + void *recvbuffs[nGpus * nThreads]; + void *expected[nGpus * nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + (size_t)nProcs * nGpus * nThreads); + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + expected + i, (size_t)maxBytes, + nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when use multi size. + ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use seprate comm + // TODO: we do not support MPI now. 
+ for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: 不支持多个size。 + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + threads[t].args.sendbuffs = sendbuffs + t * nGpus; + threads[t].args.recvbuffs = sendbuffs + t * nGpus; + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? 
threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + if (sendbuffs[i]) + CUDACHECK(cudaFree((char *)sendbuffs[i])); + // if (recvbuffs[i]) + // CUDACHECK(cudaFree((char *)recvbuffs[i])); + if (datacheck) + CUDACHECK(cudaFree(expected[i])); + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_inplace/common_inplace.h b/src_inplace/common_inplace.h new file mode 100644 index 0000000..406f634 --- /dev/null +++ b/src_inplace/common_inplace.h @@ -0,0 +1,289 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +// #define DEBUG_PRINT 1 + +#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. 
%s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < 
n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_simple/all_reduce_simple.cu b/src_inplace/ofccl_all_reduce_inp.cu similarity index 63% rename from src_simple/all_reduce_simple.cu rename to src_inplace/ofccl_all_reduce_inp.cu index bdeeb48..9b9c95f 100644 --- 
a/src_simple/all_reduce_simple.cu +++ b/src_inplace/ofccl_all_reduce_inp.cu @@ -5,14 +5,18 @@ ************************************************************************/ #include "cuda_runtime.h" -#include "common_simple.h" +#include "common_inplace.h" +#include +#include +#include +#include void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { @@ -32,6 +36,9 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + for (int i=0; i<args->nGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; CUDACHECK(cudaSetDevice(gpuid)); @@ -42,6 +49,7 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); } + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -53,8 +61,44 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); +int myCallback(int collIdFromCqe, void *args) { + // Removed the logging here; otherwise it hurts performance. + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } @@ -63,7 +107,8 @@ struct testColl allReduceTest = { AllReduceGetCollByteCount, AllReduceInitData, AllReduceGetBw, - AllReduceRunColl + AllReduceRunColl, + AllReducePrepare }; void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { @@ -100,7 +145,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t for (int i=0; i -#include - -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6s", size, count, typeName, opName); -} - -void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = count; - *recvcount = count; - *sendInplaceOffset = 0; - *recvInplaceOffset = 0; - *paramcount = *sendcount; -} - -testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); - size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; - - for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); - 
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); - CUDACHECK(cudaDeviceSynchronize()); - } - return testSuccess; -} - -void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * typesize) / 1.0E9 / sec; - - *algBw = baseBw; - double factor = ((double)(2*(nranks - 1)))/((double)nranks); - *busBw = baseBw * factor; -} - -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - static int round; - ncclGroupStart(); - printf("\n<%d> %d ofccl_nccl_test group start\n", getpid(), round); - - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 1st allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 2nd allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 3rd allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 4th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 5th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 6th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 7th allreduce\n", getpid(), round); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); - printf("<%d> %d ofccl_nccl_test 8th allreduce\n", getpid(), round); - - ncclGroupEnd(); - printf("<%d> %d ofccl_nccl_test group end\n", getpid(), round); - round++; - return testSuccess; -} - -struct testColl allReduceTest = { - "AllReduce", - AllReduceGetCollByteCount, - AllReduceInitData, - AllReduceGetBw, - AllReduceRunColl -}; - -void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { - size_t paramcount, sendInplaceOffset, recvInplaceOffset; - AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); -} - -testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { - args->collTest = &allReduceTest; - // ncclDataType_t *run_types; - // ncclRedOp_t *run_ops; - // const char **run_typenames, **run_opnames; - // int type_count, op_count; - - // if ((int)type != -1) { - // type_count = 1; - // run_types = &type; - // run_typenames = &typeName; - // } else { - // type_count = test_typenum; - // run_types = test_types; - // run_typenames = test_typenames; - // } - - // if ((int)op != -1) { - // op_count = 1; - // run_ops = &op; - // run_opnames = &opName; - // } else { - // op_count = test_opnum; - // run_ops = test_ops; - // run_opnames = test_opnames; - // } - - // for (int i=0; i %d ofccl_nccl_test invoke TimeTest\n", getpid(), test_round); - test_round++; - TESTCHECK(TimeTest(args, ncclFloat, "float", ncclSum, "sum", -1)); - return testSuccess; -} - -struct testEngine allReduceEngine = { - 
AllReduceGetBuffSize, - AllReduceRunTest }; - -#pragma weak ncclTestEngine=allReduceEngine diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index e1149ed..8fc3e4e 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -816,7 +816,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST_INIT, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; @@ -942,7 +942,7 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); - // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // Since ofcclDestroy was moved into BenchTime, we can no longer call BenchTime twice here to run out-of-place first and then in-place; to do that, add a loop inside BenchTime. PRINT("\n"); // } From eed57ca6337a5f65f4804d1695fa18a349354f6a Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 17 Oct 2022 09:24:29 +0000 Subject: [PATCH 042/109] log format --- src_inplace/common_inplace.cu | 2 +- src_inplace/ofccl_all_reduce_inp.cu | 4 ++-- src_simple/common_simple.cu | 2 +- src_simple/ofccl_all_reduce.cu | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src_inplace/common_inplace.cu b/src_inplace/common_inplace.cu index 023030b..4cb08c3 100644 --- a/src_inplace/common_inplace.cu +++ b/src_inplace/common_inplace.cu @@ -785,7 +785,7 @@ testResult_t completeColl(struct threadArgs *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // if (cudaDev == 0) { - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); // } } diff --git a/src_inplace/ofccl_all_reduce_inp.cu b/src_inplace/ofccl_all_reduce_inp.cu index 9b9c95f..9123391 100644 --- a/src_inplace/ofccl_all_reduce_inp.cu +++ b/src_inplace/ofccl_all_reduce_inp.cu @@ -75,7 +75,7 @@ int myCallback(int collIdFromCqe, void *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // int collId = ((CallBackArgs *)args)->collId; - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); return 0; } @@ -89,7 +89,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 8fc3e4e..b889947 100644 --- a/src_simple/common_simple.cu +++ 
b/src_simple/common_simple.cu @@ -785,7 +785,7 @@ testResult_t completeColl(struct threadArgs *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // if (cudaDev == 0) { - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for collId %d", pthread_self(), cudaDev, i); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); // } } diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 0c5593b..42c9628 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -75,7 +75,7 @@ int myCallback(int collIdFromCqe, void *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // int collId = ((CallBackArgs *)args)->collId; - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for collId %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); return 0; } @@ -89,7 +89,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for collId %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; From 0ef76cc20d6ed5098a9de517d6eb5890b710fce3 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 18 Oct 2022 14:45:24 +0000 Subject: [PATCH 043/109] manual buffer size done --- src_manual_size/Makefile | 109 ++ src_manual_size/common_ms.cu | 1496 ++++++++++++++++++++++++ src_manual_size/common_ms.h | 292 +++++ src_manual_size/nccl1_compat.h | 50 + src_manual_size/ofccl_all_reduce_ms.cu | 173 +++ src_simple/common_simple.cu | 25 +- 6 files changed, 2124 insertions(+), 21 deletions(-) create mode 100644 src_manual_size/Makefile create mode 100644 src_manual_size/common_ms.cu create mode 100644 src_manual_size/common_ms.h create mode 100644 src_manual_size/nccl1_compat.h create mode 100644 src_manual_size/ofccl_all_reduce_ms.cu diff --git a/src_manual_size/Makefile b/src_manual_size/Makefile new file mode 100644 index 0000000..ce42152 --- /dev/null +++ b/src_manual_size/Makefile @@ -0,0 +1,109 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 1 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. 
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := ofccl_all_reduce_ms +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_ms.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_ms.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu new file mode 100644 index 0000000..f240087 --- /dev/null +++ b/src_manual_size/common_ms.cu @@ -0,0 +1,1496 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "common_ms.h" +#include "cuda.h" +#include "nccl.h" +#include +#include +#include +#include +#include + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +// TODO: 丑丑地搞个全局变量 +// size_t countList[MULTI_ITERS] = {4000, 8192000}; +size_t countList[MULTI_ITERS] = {4000, 8192000}; +size_t sendBytesList[MULTI_ITERS]; +size_t recvBytesList[MULTI_ITERS]; + +#if NCCL_MAJOR >= 2 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + ncclHalf, + ncclFloat, + ncclDouble +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", + "uint8", + "int32", + "uint32", + "int64", + "uint64", + "half", + "float", + "double" +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + "bfloat16" +#endif +}; +int test_typenum = -1; + +const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; +ncclRedOp_t test_ops[] = { + ncclSum, + ncclProd, + ncclMax, + ncclMin +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + , + ncclAvg +#endif +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + , + ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand +#endif +}; +int test_opnum = -1; +#else +ncclDataType_t test_types[ncclNumTypes] = { + ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", + "double", "int64", "uint64"}; +int test_typenum = 7; +const char *test_opnames[] = {"sum", "prod", "max", "min"}; +ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; +int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32 * 1024 * 1024; +static size_t maxBytes = 32 * 1024 * 1024; +static size_t stepBytes = 1 * 1024 * 1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int multi_iters = MULTI_ITERS; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; +static thread_local int seenCqe[MAX_COLL_NUM]; + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024 * 1024 * 1024; + break; + case 'M': + case 'm': + units = 1024 * 1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch (type) { + case ncclHalf: + return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + return 1e-2; +#endif + case ncclFloat: + return 1e-5; + case ncclDouble: + return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + 
// case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: + return 1e-200; + } + return 1e-200; +} + +template __device__ double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template <> __device__ double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y - x)); +} + +template __device__ float toFloat(T a) { return (float)a; } +template <> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template +__global__ void deltaKern(void *A_, void *B_, size_t count, double *max) { + const T *A = (const T *)A_; + const T *B = (const T *)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + double locmax = 0.0; + for (size_t i = tid; i < count; i += blockDim.x * gridDim.x) { + + double delta = absDiff(A[i], B[i]); + if (delta > locmax) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) + printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B + i, + toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for (int stride = BSIZE / 2; stride > 1; stride >>= 1) { + __syncthreads(); + if (tid < stride) + temp[tid] = + temp[tid] > temp[tid + stride] ? temp[tid] : temp[tid + stride]; + } + __syncthreads(); + if (threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep + rank + offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. 
+template <> +__device__ double testValue(const size_t offset, const int rep, + const int rank) { + return 1.0 / (1.0 + (double)testValue(offset, rep, rank)); +} +template <> +__device__ float testValue(const size_t offset, const int rep, + const int rank) { + // IF_CHECK 如果要检查对错,把第一个return注释掉,露出来第二个。 + return 1.0 / (1.0 + (float)testValue(offset, rep, rank)); + // return 1.0 / 1.0; +} +template <> +__device__ half testValue(const size_t offset, const int rep, + const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, + const int rep, + const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template __device__ T ncclOpSum(T a, T b) { return a + b; } +template __device__ T ncclOpProd(T a, T b) { return a * b; } +template __device__ T ncclOpMax(T a, T b) { return a > b ? a : b; } +template __device__ T ncclOpMin(T a, T b) { return a < b ? a : b; } + +// Definitions for half +template <> __device__ half ncclOpSum(half a, half b) { + return __float2half(__half2float(a) + __half2float(b)); +} +template <> __device__ half ncclOpProd(half a, half b) { + return __float2half(__half2float(a) * __half2float(b)); +} +template <> __device__ half ncclOpMax(half a, half b) { + return __half2float(a) > __half2float(b) ? a : b; +} +template <> __device__ half ncclOpMin(half a, half b) { + return __half2float(a) < __half2float(b) ? a : b; +} + +template __device__ T ncclPPOpIdent(T x, int arg) { return x; } +template __device__ T ncclPPOpMul(T x, int arg) { + return x * T(arg); +} +template __device__ T ncclPPOpDiv(T x, int arg) { + return x / T(arg); +} +template <> __device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x) * float(arg)); +} +template <> __device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x) / n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template <> __device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x) * float(arg)); +} +template <> __device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x) / n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { return 1 + rank % 2; } + +template +__global__ void InitDataReduceKernel(T *data, const size_t N, + const size_t offset, const int rep, + const int nranks) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) { + T val = testValue(o + offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i = 1; i < nranks; i++) { + T val1 = testValue(o + offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) \ + (void *)InitDataReduceKernel, preop, postop> +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum /*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, 
ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum /*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else +#define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void *const redInitDataKerns[test_opNumMax * ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), + OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void *data, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op, const int rep, + const int nranks) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[5] = {(void *)&data, (void *)&count, (void *)&offset, (void *)&rep, + (void *)&nranks}; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type * test_opNumMax + op], grid, + block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T *data, const size_t N, const int rep, + const int rank) { + for (size_t o = blockIdx.x * blockDim.x + threadIdx.x; o < N; + o += gridDim.x * blockDim.x) + data[o] = testValue(o, rep, rank); +} + +static void *const initDataKerns[ncclNumTypes] = { + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, (void *)InitDataKernel, + (void *)InitDataKernel, +#if defined(__CUDA_BF16_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + (void *)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void *dest, const size_t N, const int rep, + const int rank) { + T *ptr = (T *)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void *data, const size_t count, ncclDataType_t type, + const int rep, const int rank) { + dim3 grid = {32, 1, 1}; + dim3 block = {256, 1, 1}; + void *args[4] = {(void *)&data, (void *)&count, (void *)&rep, (void *)&rank}; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs *args) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + args->barrier_idx = !args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs *args, double *value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) + pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) + val += val2; + if (average == 2) + val = std::min(val, val2); + if (average == 3) + val = std::max(val, val2); + } + if (average || args->thread == 0) + args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread + 1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + 
MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void *)&args->reduce[args->barrier_idx], 1, + MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) + args->reduce[args->barrier_idx] /= args->nProcs * args->nThreads; + args->reduce[1 - args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) + pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx = !args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t *streams, + ncclComm_t *comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int *done = (int *)malloc(sizeof(int) * ngpus); + memset(done, 0, sizeof(int) * ngpus); + while (remaining) { + int idle = 1; + for (int i = 0; i < ngpus; i++) { + if (done[i]) + continue; + + cudaErr = cudaStreamQuery(streams[i]); + if (cudaErr == cudaSuccess) { + done[i] = 1; + remaining--; + idle = 0; + continue; + } + + if (cudaErr != cudaErrorNotReady) + CUDACHECK(cudaErr); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + if (test_ncclVersion >= NCCL_VERSION(2, 4, 0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i = 0; i < ngpus; i++) + NCCLCHECK(ncclCommAbort(comms[i])); + // Abort the perf test + NCCLCHECK(ncclAsyncErr); + } + } +#endif + } + + // We might want to let other threads (including NCCL threads) use the CPU. 
+ if (idle) + pthread_yield(); + } + free(done); + return testSuccess; +} + +testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + if (args->nGpus != 1) { + OFTEST_LOG1(TESTERR, "prepareColl cannot handle multiple GPUs"); + return testInternalError; + } + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + + return testSuccess; +} + +testResult_t startColl(struct threadArgs *args, ncclDataType_t type, + ncclRedOp_t opIndex, int root, int in_place, int iter, int miter, ofcclRankCtx_t rankCtx) { + size_t count = args->nbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and + // catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupStart"); + NCCLCHECK(ncclGroupStart()); + } + for (int i = 0; i < args->nGpus; i++) { + ncclComm_t comm = args->comms[miter * nGpus + i]; + // OFTEST_LOG(TEST, "commIndex=%d, comm=%p", miter * nGpus + i, comm); +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(comm, &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc * args->nThreads + args->thread) * args->nGpus + i); + // char *recvBuff = ((char *)args->recvbuffs[i]) + shift; + // char *sendBuff = ((char *)args->sendbuffs[i]) + shift; + char *recvBuff = (char *)(args->recvbuffs[miter]); + char *sendBuff = (char *)(args->sendbuffs[miter]); + + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, RUN sendbuff @ %p, recvbuff @ %p", cudaDev, miter, sendBuff, recvBuff); + + ncclRedOp_t op; + + if (opIndex < ncclNumOps) { + op = opIndex; + } +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + else { + union { + int8_t i8; + uint8_t u8; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + half f16; + float f32; + double f64; +#if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; +#endif + }; + int scalar = preMulScalar(rank); + switch (type) { + case ncclInt8: + i8 = int8_t(scalar); + break; + case ncclUint8: + u8 = uint8_t(scalar); + break; + case ncclInt32: + i32 = int32_t(scalar); + break; + case ncclUint32: + u32 = uint32_t(scalar); + break; + case ncclInt64: + i64 = int32_t(scalar); + break; + case ncclUint64: + u64 = uint32_t(scalar); + break; + case ncclFloat16: + f16 = __float2half(float(scalar)); + break; + case ncclFloat32: + f32 = float(scalar); + break; + case ncclFloat64: + f64 = double(scalar); + break; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + bf16 = __float2bfloat16(float(scalar)); + break; +#endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum( + &op, &u64, type, ncclScalarHostImmediate, comm)); + } +#endif + // miter就是collId。 + TESTCHECK(args->collTest->runColl( + (void *)(sendBuff), + (void *)(recvBuff), miter, cbArgList + miter, rankCtx)); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + if (opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, comm)); + } +#endif + } + if (args->nGpus > 1) { + // OFTEST_LOG1(TEST, "startColl, args->nGpus > 1 run ncclGroupEnd"); + NCCLCHECK(ncclGroupEnd()); + } + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) + Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs *args) { + if (blocking_coll) + return testSuccess; + + + int gotCqeCnt = 0; + while (gotCqeCnt < multi_iters) { + for (int i = 0; i < multi_iters; i++) { + pthread_mutex_lock(&cbArgList[i].mutex); + if (cbArgList[i].gotCqe == 1) { + if (seenCqe[i] == 0) { + gotCqeCnt++; + seenCqe[i] = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // if (cudaDev == 0) { + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); + // } + + } + } + pthread_mutex_unlock(&cbArgList[i].mutex); + } + } + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + + size_t count = args->nbytes / wordSize(type); + + Barrier(args); + + // Performance 
Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + iter * multi_iters + miter, miter, rankCtx)); + } + + TESTCHECK(completeColl(args)); + + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + } + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec / (iters * agg_iters *multi_iters); + if (cudaGraphLaunches >= 1) + deltaSec = deltaSec / cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, + args->nProcs * args->nThreads * args->nGpus); + + Barrier(args); + + ofcclDestroy(rankCtx); + + double maxDelta = 0; + // static __thread int rep = 0; // 为了再次初始化buffer的参数,没用了。 + // rep++; + if (datacheck) { + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec * 1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs *args) { + int nranks = args->nProcs * args->nGpus * args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, + recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, + &sendInplaceOffset, &recvInplaceOffset, + (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, + const char *typeName, ncclRedOp_t op, const char *opName, + int root, bool is_ofccl) { + // 首先创建ofcclRankCtx_t + int thrdCudaDev; + CUDACHECK(cudaGetDevice(&thrdCudaDev)); + ofcclRankCtx_t rankCtx; + ofcclInitRankCtx(&rankCtx, thrdCudaDev); + + // prepare for all size. op, type traversed in the caller. 
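// The prepare loop below registers every collective with ofccl before any of them is launched.
// As a minimal sketch, the per-rank call sequence this test drives is roughly the following,
// with the ofccl entry points and their arguments taken from how they are used in this patch
// (error handling and the multi_iters/threading details omitted):
//
//   ofcclRankCtx_t ctx;
//   ofcclInitRankCtx(&ctx, cudaDev);                               // one context per rank/device
//   ofcclPrepareAllReduce(count, type, op, comm, collId, ctx);     // once per collId, each with its own comm
//   ofcclFinalizeRankCtx7StartHostThrds(ctx);                      // start the daemon kernel and host threads
//   ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, &cbArg, ctx);
//   while (!cbArg.gotCqe) { /* poll under cbArg.mutex until the CQE callback fires */ }
//   ofcclDestroy(ctx);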
+ // TODO: if we support multi size, each size should use a separate ncclComm + + for (int miter = 0; miter < multi_iters; miter++) { + args->nbytes = sendBytesList[miter]; + args->sendBytes = args->nbytes; + TESTCHECK(prepareColl(args, type, op, root, 0, miter/* iter * multi_iters + miter when iter=0 */, miter, rankCtx)); + } + + // 在这里完成check数据的准备; + static __thread int rep = 0; + rep++; + if (datacheck) { // 让init数据的kernel在启动daemonKernel之前执行。 + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, 0)); + + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev); + } + + // ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 + ofcclFinalizeRankCtx7StartHostThrds(rankCtx); + + // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 + // warmup还是需要开,不然ofccl性能拉胯。 + for (int iter = 0; iter < warmup_iters; iter++) { + for (int miter = 0; miter < multi_iters; miter++) { + args->nbytes = sendBytesList[miter]; + args->sendBytes = args->nbytes; + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, 0, + iter * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), thrdCudaDev, iter, multi_iters); + } + + print_line_header(max(args->sendBytes, args->expectedBytes), + args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); + // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // 由于我们把ofcclDestroy挪到BenchTime里边,所以没办法在这里通过调用两次BenchTime来先做out-of-place,再做in-place。像这样的话,可以在BenchTime里加个循环。 + PRINT("\n"); + + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadRunTests"); + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, + test_typenames[nccltype], + (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs *args) { + // OFTEST_LOG1(TEST, "Enter threadInit"); + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs * args->nThreads * args->nGpus; + + // set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { + int rank = args->proc * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + int gpuid = args->localRank * args->nThreads * args->nGpus + + args->thread * args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(args->comms + i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i = 0; i < args->nGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void *threadLauncher(void *thread_) { + struct testThread *thread = (struct testThread *)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread *thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, + size_t recvBytes, void **expected, size_t nbytes, + int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) + CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t AllocateBuffLists(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes) { + CUDACHECK(cudaMalloc(sendbuff, sendBytes)); + CUDACHECK(cudaMalloc(recvbuff, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char *argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 4, 0) + ncclGetVersion(&test_ncclVersion); +#else + test_ncclVersion = NCCL_VERSION_CODE; +#endif +// printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, +// test_ncclVersion); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 0, 0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) && + test_ncclVersion >= NCCL_VERSION(2, 10, 0)) { + test_opnum++; // ncclAvg +#if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 +#endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) && + test_ncclVersion >= NCCL_VERSION(2, 11, 0)) { + test_opnum++; // PreMulSum + } +#endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"multi_iters", required_argument, 0, 'M'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {}}; + + while (1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:M:m:w:p:c:o:d:r:z:hG:a:", longopts, + &longindex); + + if (c == -1) + break; + + switch (c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if 
(parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'M': + multi_iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && \ + CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA " + "11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') + printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-M,--multi_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time " + "<0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, + "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), + MPI_BYTE, MPI_COMM_WORLD); + for (int p = 0; p < nProcs; p++) { + if (p == proc) + break; + if (hostHashs[p] == hostHashs[proc]) + localRank++; + } +#endif + is_main_thread = (proc == 0) ? 1 : 0; + + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup " + "iters: %d iters: %d validation: %d \n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1) ? stepFactor : stepBytes, + (stepFactor > 1) ? 
"factor" : "bytes", warmup_iters, iters, datacheck); + if (blocking_coll) + PRINT("# Blocking Enabled: wait for completion and barrier after each " + "collective \n"); + if (parallel_init) + PRINT("# Parallel Init Enabled: threads call into NcclInitRank " + "concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + if (multi_iters != 2) { + // TODO: he is only a baby T^T + OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d damie", pthread_self(), cudaDev, multi_iters); + } + OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i = 0; i < nThreads * nGpus; i++) { + int cudaDev = localRank * nThreads * nGpus + i; + int rank = proc * nThreads * nGpus + i; + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + len += + snprintf(line + len, MAX_LINE - len, + "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, + getpid(), hostname, cudaDev, prop.pciBusID, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + +#if MPI_SUPPORT + char *lines = (proc == 0) ? (char *)malloc(nProcs * MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, + MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines + MAX_LINE * p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); +#else + PRINT("%s", line); +#endif + + // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for + // the rest. + size_t memMaxBytes = (maxMem - (1 << 30)) / (datacheck ? 3 : 2); + if (maxBytes > memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) + printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", + maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus * nThreads]; + void *sendbuffs[nGpus * nThreads][MULTI_ITERS]; + void *recvbuffs[nGpus * nThreads][MULTI_ITERS]; + void *expected[nGpus * nThreads]; + // size_t sendBytes, recvBytes; + + // ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, + // (size_t)nProcs * nGpus * nThreads); + + ncclTestEngine.getCollByteCountList(sendBytesList, recvBytesList, countList, multi_iters); + // for (int i = 0; i < MULTI_ITERS; i++) { + // OFTEST_LOG(TEST, "sendBytesList[%d] = %lu, recvBytesList[%d] = %lu", i, sendBytesList[i], i, recvBytesList[i]); + // } + + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // 这里的调用是给每个线程分配。 + // TESTCHECK(AllocateBuffs(sendbuffs + i, sendBytes, recvbuffs + i, recvBytes, + // expected + i, (size_t)maxBytes, + // nProcs * nThreads * nGpus)); + CUDACHECK(cudaStreamCreateWithFlags(streams + i, cudaStreamNonBlocking)); + + for (int j = 0; j < multi_iters; j++) { + AllocateBuffLists(&sendbuffs[i][j], sendBytesList[j], &recvbuffs[i][j], recvBytesList[j]); + + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, ALLOCATE sendbuff @ %p, recvbuff @ %p", i, j, sendbuffs[i][j], recvbuffs[i][j]); + } + } + + // if parallel init is not selected, use main thread to initialize NCCL + // TODO: assign more comms when use multi size. 
+ ncclComm_t *comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + ncclComm_t *adjusted_comms = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * nThreads * nGpus * multi_iters); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus * nThreads]; + for (int i = 0; i < nGpus * nThreads; i++) + gpuArray[i] = i; + // OFTEST_LOG1(TEST, "CommInitAll here"); + // use seprate comm + // TODO: we do not support MPI now. + for (int miter = 0; miter < multi_iters; miter++) { + NCCLCHECK( + ncclCommInitAll(comms + miter * nThreads * nGpus, nThreads * nGpus, gpuArray)); + for (int tid = 0; tid < nThreads; tid++) { + memcpy(adjusted_comms + (tid * multi_iters + miter) * nGpus, comms + (miter * nThreads + tid) * nGpus, sizeof(ncclComm_t) * nGpus); + } + } + + // for (int miter = 0; miter < multi_iters; miter++) { + // for (int tid = 0; tid < nThreads; tid++) { + // OFTEST_LOG(TEST, "miter(%d), tid(%d), comm=%p", miter, tid, comms + (miter * nThreads + tid) * nGpus); + // } + // } + // for (int tid = 0; tid < nThreads; tid++) { + // for (int miter = 0; miter < multi_iters; miter++) { + // OFTEST_LOG(TEST, "tid(%d), miter(%d), adjusted_comm=%p", tid, miter, adjusted_comms + (tid * multi_iters + miter) * nGpus); + // } + // } + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < nGpus * nThreads; i++) { + CUDACHECK(cudaSetDevice(localRank * nThreads * nGpus + i)); + // OFTEST_LOG1(TEST, "CommInitRank here"); + NCCLCHECK(ncclCommInitRank(comms + i, nProcs * nThreads * nGpus, ncclId, + proc * nThreads * nGpus + i)); + } + NCCLCHECK(ncclGroupEnd()); + } + } + + int errors[nThreads]; + double bw[nThreads]; + double *delta; + CUDACHECK(cudaHostAlloc(&delta, sizeof(double) * nThreads * NUM_BLOCKS, + cudaHostAllocPortable | cudaHostAllocMapped)); + int bw_count[nThreads]; + for (int t = 0; t < nThreads; t++) { + bw[t] = 0.0; + errors[t] = bw_count[t] = 0; + } + + PRINT("#\n"); + print_header(); + + int *sync = (int *)calloc(2, sizeof(int)); + int *barrier = (int *)calloc(2, sizeof(int)); + double *reduce = (double *)calloc(2, sizeof(double)); + + struct testThread threads[nThreads]; + memset(threads, 0, sizeof(struct testThread) * nThreads); + + for (int t = nThreads - 1; t >= 0; t--) { + threads[t].args.minbytes = minBytes; + threads[t].args.maxbytes = maxBytes; + // TODO: 不支持多个size。 + if (minBytes != maxBytes) { + OFTEST_LOG1(TEST_FATAL, "Only supports single size now"); + return testInternalError; + } + threads[t].args.stepbytes = stepBytes; + threads[t].args.stepfactor = stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs = nProcs; + threads[t].args.proc = proc; + threads[t].args.nThreads = nThreads; + threads[t].args.thread = t; + threads[t].args.nGpus = nGpus; + // threads[t].args.sendbuffs = sendbuffs[t]; + // threads[t].args.recvbuffs = recvbuffs[t]; + for (int j = 0; j < MULTI_ITERS; j++) { + threads[t].args.sendbuffs[j] = sendbuffs[t][j]; + threads[t].args.recvbuffs[j] = recvbuffs[t][j]; + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, DISPATCH SRC sendbuff @ %p, recvbuff @ %p", t, j, sendbuffs[t][j], recvbuffs[t][j]); + // OFTEST_LOG(TEST, "Rank<%d> coll_id = %d, DISPATCH IN ARGS sendbuff @ %p, recvbuff @ %p", t, j, threads[t].args.sendbuffs[j], threads[t].args.recvbuffs[j]); + } + threads[t].args.expected = expected + t * nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms = adjusted_comms + t * multi_iters * nGpus; + // for (int i = 0; i < multi_iters * nGpus; i++) { + // OFTEST_LOG(TEST, "tid(%d), multi_iters=%d, 
nGpus=%d, %dth comm=%p", t, multi_iters, nGpus, i, threads[t].args.comms+i); + // } + + threads[t].args.streams = streams + t * nGpus; + + threads[t].args.barrier = (volatile int *)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double *)reduce; + threads[t].args.sync = (volatile int *)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t * NUM_BLOCKS); + threads[t].args.errors = errors + t; + threads[t].args.bw = bw + t; + threads[t].args.bw_count = bw_count + t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads + t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t = nThreads - 1; t >= 0; t--) { + if (t) + pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for (int i = 0; i < nGpus * nThreads; ++i) + NCCLCHECK(ncclCommDestroy(comms[i])); + free(comms); + } + + // Free off CUDA allocated memory + for (int i = 0; i < nGpus * nThreads; i++) { + for (int j = 0; j < MULTI_ITERS; j++) { + CUDACHECK(cudaFree((char *)sendbuffs[i][j])); + CUDACHECK(cudaFree((char *)recvbuffs[i][j])); + } + } + CUDACHECK(cudaFreeHost(delta)); + + char *str = getenv("NCCL_TESTS_MIN_BW"); + double check_avg_bw = str ? atof(str) : -1; + bw[0] /= bw_count[0]; + + PRINT("# Out of bounds values : %d %s\n", errors[0], + errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], + check_avg_bw == -1 ? "" + : (bw[0] < check_avg_bw * (0.9) ? "FAILED" : "OK")); + PRINT("#\n"); +#ifdef MPI_SUPPORT + MPI_Finalize(); +#endif + + // 'cuda-memcheck --leak-check full' requires this + cudaDeviceReset(); + + if (errors[0] || bw[0] < check_avg_bw * (0.9)) + exit(EXIT_FAILURE); + else + exit(EXIT_SUCCESS); +} diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h new file mode 100644 index 0000000..c9a477d --- /dev/null +++ b/src_manual_size/common_ms.h @@ -0,0 +1,292 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include "nccl.h" +#include +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +// #define DEBUG_PRINT 1 + +#define MULTI_ITERS 2 + +#define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +#define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) +#define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +typedef struct { + int collId; + int gotCqe; + pthread_mutex_t mutex; +} CallBackArgs; + +#define MAX_COLL_NUM 10000 + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); + void (*getCollByteCountList)(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void* sendbuffs[MULTI_ITERS]; + size_t sendBytes; + size_t sendInplaceOffset; + void* recvbuffs[MULTI_ITERS]; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t 
(*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, bool is_ofccl=false); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 
1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu new file mode 100644 index 0000000..d0fafb0 --- /dev/null +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -0,0 +1,173 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_ms.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error\n"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "\n"); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + int cudaDev; + cudaGetDevice(&cudaDev); + OFTEST_LOG(TEST, "Hi <%lu> Rank<%d>, sendcount = %p, recvcount = %p, paramcount = %p, sendInplaceOffset = %p, recvInplaceOffset = %p, count = %lu, nranks = %d", pthread_self(), cudaDev, sendcount, recvcount, paramcount, sendInplaceOffset, recvInplaceOffset, count, nranks); + + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +void AllReduceGetCollByteCountList(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen) { // listLen就等于multi_iter + // OFTEST_LOG1(TEST, "hi"); + for (int i = 0; i < listLen; i++) { + *(sendCntList + i) = *(countList + i); + *(recvCntList + i) = *(countList + i); + } +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + return 0; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + + // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + + NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl, + AllReducePrepare +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; icollTest->initData(args, type, op, root, rep, 0)); - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev); } - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 - // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); - // } + // ofcclPrepareDone(rankCtx); // TODO: 测性能的时候保持这里,cheat一下,省下启动kernel的时间。同时配合ofccl里,不要激进地主动退出。 + ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // TODO: if we support multi size, 我们可以对所有size都warm up;或者保留现在的方式,但是要保证选取了正确的comm。 // warmup还是需要开,不然ofccl性能拉胯。 @@ -931,25 +925,14 @@ testResult_t 
TimeTest(struct threadArgs *args, ncclDataType_t type, iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), cudaDev, iter, multi_iters); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth iter for %d colls", pthread_self(), thrdCudaDev, iter, multi_iters); } - // Benchmark - // for (size_t size = args->minbytes; size <= args->maxbytes; - // size = ((args->stepfactor > 1) ? size * args->stepfactor - // : size + args->stepbytes)) { - // setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); TESTCHECK(BenchTime(args, type, op, root, 0, rankCtx)); // TESTCHECK(BenchTime(args, type, op, root, 1, rankCtx)); // 由于我们把ofcclDestroy挪到BenchTime里边,所以没办法在这里通过调用两次BenchTime来先做out-of-place,再做in-place。像这样的话,可以在BenchTime里加个循环。 PRINT("\n"); - // } - - // if (is_ofccl) { - // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclDestroy", pthread_self()); - // ofcclDestroy(rankCtx); // 为了做check,把这个挪到BenchTime里边。 - // } return testSuccess; } From d6cad8e8521fc1edc6d59415053578012c4c9791 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 19 Oct 2022 08:07:14 +0000 Subject: [PATCH 044/109] adjust log --- src_inplace/common_inplace.cu | 2 +- src_manual_size/common_ms.cu | 2 +- src_manual_size/ofccl_all_reduce_ms.cu | 8 ++++---- src_simple/common_simple.cu | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src_inplace/common_inplace.cu b/src_inplace/common_inplace.cu index 4cb08c3..22cfecb 100644 --- a/src_inplace/common_inplace.cu +++ b/src_inplace/common_inplace.cu @@ -816,7 +816,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index f240087..08687bb 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -827,7 +827,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu index d0fafb0..2d925f3 100644 --- a/src_manual_size/ofccl_all_reduce_ms.cu +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -84,10 +84,10 @@ int myCallback(int collIdFromCqe, void *args) { ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); - // int collId = ((CallBackArgs *)args)->collId; - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + int collId = ((CallBackArgs *)args)->collId; + OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, 
collId); return 0; } diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 5a0824a..42cbe1c 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -816,7 +816,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST, "<%lu> rank=%d, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; From cec88ef5e2c75f11d4b343e0d6b158b59bfe0dd8 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 20 Oct 2022 05:28:04 +0000 Subject: [PATCH 045/109] + nccl_manual_size --- src_nccl_manual_size/Makefile | 109 ++ src_nccl_manual_size/all_reduce_nccl_ms.cu | 114 ++ src_nccl_manual_size/common_nccl_ms.cu | 1173 ++++++++++++++++++++ src_nccl_manual_size/common_nccl_ms.h | 275 +++++ src_nccl_manual_size/nccl1_compat.h | 50 + 5 files changed, 1721 insertions(+) create mode 100644 src_nccl_manual_size/Makefile create mode 100644 src_nccl_manual_size/all_reduce_nccl_ms.cu create mode 100644 src_nccl_manual_size/common_nccl_ms.cu create mode 100644 src_nccl_manual_size/common_nccl_ms.h create mode 100644 src_nccl_manual_size/nccl1_compat.h diff --git a/src_nccl_manual_size/Makefile b/src_nccl_manual_size/Makefile new file mode 100644 index 0000000..4a67159 --- /dev/null +++ b/src_nccl_manual_size/Makefile @@ -0,0 +1,109 @@ +# +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 1 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. 
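# Since CARDNAME and NVCC_GENCODE are assigned with ?=, both can be overridden from the
# environment or the make command line; for example (hypothetical invocations; the presets
# below only cover sm_86 for CARDNAME=3080 and sm_75 for CARDNAME=2080):
#   make CARDNAME=2080
#   make NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"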
+# ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +# NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_80,code=sm_80 \ +# -gencode=arch=compute_80,code=compute_80 +# else +# NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ +# -gencode=arch=compute_50,code=sm_50 \ +# -gencode=arch=compute_60,code=sm_60 \ +# -gencode=arch=compute_61,code=sm_61 \ +# -gencode=arch=compute_70,code=sm_70 \ +# -gencode=arch=compute_70,code=compute_70 +# endif + +CUDA_GENCODE_3080 = -gencode=arch=compute_86,code=sm_86 +CUDA_GENCODE_2080 = -gencode=arch=compute_75,code=sm_75 + +CARDNAME ?= 3080 +ifeq ($(CARDNAME), 3080) +NVCC_GENCODE ?= $(CUDA_GENCODE_3080) $(CUDA_PTX_INUSE) +else +NVCC_GENCODE ?= $(CUDA_GENCODE_2080) $(CUDA_PTX_INUSE) +endif +$(info CARDNAME $(CARDNAME)) +$(info NVCC_GENCODE $(NVCC_GENCODE)) + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +$(info CARDNAME $(NVCUFLAGS)) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := all_reduce_nccl_ms +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu common_nccl_ms.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common_nccl_ms.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src_nccl_manual_size/all_reduce_nccl_ms.cu b/src_nccl_manual_size/all_reduce_nccl_ms.cu new file mode 100644 index 0000000..95d7b28 --- /dev/null +++ b/src_nccl_manual_size/all_reduce_nccl_ms.cu @@ -0,0 +1,114 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common_nccl_ms.h" + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = 
test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i +#include +#include +#include +#include "cuda.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int cudaGraphLaunches = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { + case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template __device__ +float toFloat(T a) { + return (float)a; +} +template<> __device__ +float toFloat(half a) { + return __half2float(a); +} +#if 
defined(__CUDA_BF16_TYPES_EXIST__) +template<> __device__ +float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif + +template __global__ +void deltaKern(void* A_, void* B_, size_t count, double* max) { + const T* A = (const T*)A_; + const T* B = (const T*)B_; + __shared__ double temp[BSIZE]; + int tid = blockIdx.x*blockDim.x + threadIdx.x; + double locmax = 0.0; + for(size_t i=tid; i locmax ) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + tid = threadIdx.x; + temp[tid] = locmax; + for(int stride = BSIZE/2; stride > 1; stride>>=1) { + __syncthreads(); + if( tid < stride ) + temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + } + __syncthreads(); + if( threadIdx.x == 0) + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; +#endif + case ncclHalf: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<>>(results, expected, count, devmax); break; + } + CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; i +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep+rank+offset) % 256; + return (T)v; +} + +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template<> +__device__ double testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(double)testValue(offset, rep, rank)); +} +template<> +__device__ float testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(float)testValue(offset, rep, rank)); +} +template<> +__device__ half testValue(const size_t offset, const int rep, const int rank) { + return __float2half(testValue(offset, rep, rank)); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif + +// Operations +template +__device__ T ncclOpSum(T a, T b) { return a+b; } +template +__device__ T ncclOpProd(T a, T b) { return a*b; } +template +__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } +template +__device__ T ncclOpMin(T a, T b) { return a +__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } +template<> +__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } +template<> +__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } +template<> +__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } + +template +__device__ T ncclPPOpIdent(T x, int arg) { return x; } +template +__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } +template +__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } +template<> +__device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x)*float(arg)); +} +template<> +__device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x)/n); +} +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x)*float(arg)); +} +template<> +__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x)/n); +} +#endif + +__host__ __device__ int preMulScalar(int rank) { + return 1 + rank%2; +} + +template +__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); + val = PreOp(val, preMulScalar(0)); + for (int i=1; i(o+offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); + } + data[o] = PostOp(val, nranks); + } +} + +#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) +#else + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) +#endif + +static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + OPS(__nv_bfloat16) +#endif +}; + +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; + CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +template +__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); +} + +static void* const initDataKerns[ncclNumTypes] = { + (void*)InitDataKernel< int8_t>, + (void*)InitDataKernel< uint8_t>, + (void*)InitDataKernel< int32_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< int64_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< half>, + 
(void*)InitDataKernel< float>, + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + (void*)InitDataKernel<__nv_bfloat16> +#endif +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + args->barrier_idx=!args->barrier_idx; +} + +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; + args->barrier_idx=!args->barrier_idx; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); + } + printf("\n"); + + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(expectedHost); + free(dataHost); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + *delta = maxDelta; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? 
recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + if (datacheck) { // 这里的目的应该是让测带宽跑的coll也使用非0数据。 + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } + + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + Barrier(args); + +#if CUDART_VERSION >= 11030 + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + // Thread local mode is needed for: + // - Multi-thread mode + // - P2P pre-connect + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } +#endif + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Resync CPU, restart timing, launch cuda graph + Barrier(args); + start = std::chrono::high_resolution_clock::now(); + for (int l=0; lnGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } + } +#endif + + TESTCHECK(completeColl(args)); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + double maxDelta = 0; + static __thread int rep = 0; + rep++; + if (datacheck) { + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Allreduce(args, &maxDelta, 3); + } + + double timeUsec = deltaSec*1.0E6; + char 
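For readers unfamiliar with the graph path used in BenchTime: when cudaGraphLaunches >= 1 the timed loop is recorded once into a CUDA graph and then replayed, so per-launch CPU overhead drops out of the measurement and the elapsed time is later divided by iters*agg_iters and by the number of graph launches. A minimal sketch of that capture/replay pattern, assuming a single stream and a placeholder enqueueIteration() that issues the collectives (both names are illustrative, not part of the harness; per-call error checks trimmed):

#include <cuda_runtime.h>

cudaError_t replayCaptured(cudaStream_t stream, int launches,
                           void (*enqueueIteration)(cudaStream_t)) {
  cudaGraph_t graph;
  cudaGraphExec_t exec;
  // Thread-local capture so concurrent capturing threads do not interfere.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  enqueueIteration(stream);              // work is recorded, not executed yet
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&exec, graph, NULL, NULL, 0);
  for (int l = 0; l < launches; l++)
    cudaGraphLaunch(exec, stream);       // replay without per-call launch cost
  cudaError_t err = cudaStreamSynchronize(stream);
  cudaGraphExecDestroy(exec);
  cudaGraphDestroy(graph);
  return err;
}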
timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // Warm-up for large size + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Warm-up for small size + setupArgs(args->minbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TODO: 实测是否恢复? + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (args->proc == 0 && args->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 
'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); +#endif + TESTCHECK(run()); + return 0; +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + size_t maxMem = ~0; + for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + 
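The stream, buffer and communicator arrays set up here are flat, with one entry per (thread, GPU) pair, and the same arithmetic yields both the global NCCL rank and each thread's slice of those arrays. A small sketch of that mapping, with names mirroring the threadArgs fields used throughout the harness:

// Sketch of the rank / slice arithmetic used by the harness: worker thread t
// owns nGpus consecutive entries of the flat arrays, and the global NCCL rank
// of its i-th GPU combines process, thread and local GPU index.
static inline int globalRank(int proc, int nThreads, int thread, int nGpus, int i) {
  return (proc * nThreads + thread) * nGpus + i;
}
static inline int sliceIndex(int thread, int nGpus, int i) {
  return thread * nGpus + i;   // index into streams[], sendbuffs[], comms[], ...
}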
void* expected[nGpus*nThreads]; + size_t sendBytes, recvBytes; + + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + for (int i=0; i=0; t--) { + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; + + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + threads[t].args.sendbuffs = sendbuffs+t*nGpus; + threads[t].args.recvbuffs = recvbuffs+t*nGpus; + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; + + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; + + threads[t].args.reportErrors = 1; + + threads[t].func = parallel_init ? threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + if (!parallel_init) { + for(int i=0; i +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ + return testCudaError; \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ + return testNcclError; \ + } \ +} while(0) + +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, +} testResult_t; + +// Relay errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. 
%s pid %d: Test failure %s:%d\n", \ + hostname, getpid(), \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; +extern struct testColl alltoAllTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expected; + size_t expectedBytes; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + volatile double* reduce; + int syncRank; + int syncNranks; + double* deltaHost; + int* errors; + double* bw; + int* bw_count; + + int reportErrors; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); + +// Provided by each coll +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHash(const char* string, size_t n) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (size_t c = 0; c < n; c++) { + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent 
of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern int test_ncclVersion; // init'd with ncclGetVersion() +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0); +extern int test_opnum; +extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; + +static int ncclstringtotype(char *str) { + for (int t=0; t +#ifndef NCCL1_COMPAT_H +#define NCCL1_COMPAT_H + +#ifndef NCCL_MAJOR // NCCL 1.x +#define NCCL_MAJOR 1 +#define NCCL_MINOR 0 + +#define ncclNumOps nccl_NUM_OPS +#define ncclNumTypes nccl_NUM_TYPES + +static ncclResult_t ncclGroupStart() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } +static ncclResult_t ncclGroupEnd() { printf("[%s:%d] <%s>\n", __FILE__, __LINE__, __func__); return ncclSuccess; } + +#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + 
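The NCCL 1.x shims defined in nccl1_compat.h exist so the same call sites compile against both major versions: they accept the 2.x-style size_t counts, reject anything that would overflow the 1.x int counts via CHECKCOUNT, and forward to the old entry points (including the 1.x argument order of ncclAllGather). A small usage sketch under that assumption; comm, stream and the device buffers are presumed valid and ncclFloat/ncclSum stand in for the configured type and op:

// Sketch: a call site written against the size_t-count API. Built against
// NCCL 1.x, the shim narrows count to int after the CHECKCOUNT guard, so a
// count above INT_MAX comes back as ncclInvalidArgument instead of truncating.
static ncclResult_t allReduceChecked(const void* sendbuff, void* recvbuff,
                                     size_t count, ncclComm_t comm,
                                     cudaStream_t stream) {
  return ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, comm, stream);
}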
+#endif From 34f1b12ebaf341507665f30cf1a3d0bf2baa1c76 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 20 Oct 2022 05:58:08 +0000 Subject: [PATCH 046/109] nccl manual size seems ok --- src_nccl_manual_size/all_reduce_nccl_ms.cu | 11 +- src_nccl_manual_size/common_nccl_ms.cu | 146 ++++++++------------- src_nccl_manual_size/common_nccl_ms.h | 7 +- 3 files changed, 68 insertions(+), 96 deletions(-) diff --git a/src_nccl_manual_size/all_reduce_nccl_ms.cu b/src_nccl_manual_size/all_reduce_nccl_ms.cu index 95d7b28..7bab5c2 100644 --- a/src_nccl_manual_size/all_reduce_nccl_ms.cu +++ b/src_nccl_manual_size/all_reduce_nccl_ms.cu @@ -27,6 +27,14 @@ void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par *paramcount = *sendcount; } +void AllReduceGetCollByteCountList(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen) { // listLen就等于agg_iters + // OFTEST_LOG1(TEST, "hi"); + for (int i = 0; i < listLen; i++) { + *(sendCntList + i) = *(countList + i); + *(recvCntList + i) = *(countList + i); + } +} + testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); @@ -108,7 +116,8 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t struct testEngine allReduceEngine = { AllReduceGetBuffSize, - AllReduceRunTest + AllReduceRunTest, + AllReduceGetCollByteCountList }; #pragma weak ncclTestEngine=allReduceEngine diff --git a/src_nccl_manual_size/common_nccl_ms.cu b/src_nccl_manual_size/common_nccl_ms.cu index f77cd48..2a8f6ec 100644 --- a/src_nccl_manual_size/common_nccl_ms.cu +++ b/src_nccl_manual_size/common_nccl_ms.cu @@ -13,6 +13,12 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() +// TODO: 丑丑地搞个全局变量 +// size_t countList[AGG_ITERS] = {4000, 8192000}; +size_t countList[AGG_ITERS] = {4000, 8192000}; +size_t sendBytesList[AGG_ITERS]; +size_t recvBytesList[AGG_ITERS]; + #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = { ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble @@ -59,7 +65,7 @@ static size_t stepFactor = 1; static int datacheck = 1; static int warmup_iters = 5; static int iters = 20; -static int agg_iters = 1; +static int agg_iters = AGG_ITERS; static int ncclop = ncclSum; static int nccltype = ncclFloat; static int ncclroot = 0; @@ -512,10 +518,10 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter) { size_t count = args->nbytes / wordSize(type); - // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange - size_t totalnbytes = max(args->sendBytes, args->expectedBytes); - size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; - size_t shift = totalnbytes * (iter % steps); + // // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + // size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + // size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + // size_t shift = totalnbytes * (iter % steps); if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); for (int i = 0; i < args->nGpus; i++) { @@ -525,8 +531,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t CUDACHECK(cudaSetDevice(cudaDev)); #endif int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - char* recvBuff = ((char*)args->recvbuffs[i]) + shift; - char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + char *recvBuff = (char *)(args->recvbuffs[iter]); + char *sendBuff = (char *)(args->sendbuffs[iter]); ncclRedOp_t op; if(opIndex < ncclNumOps) { @@ -561,8 +567,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #endif TESTCHECK(args->collTest->runColl( - (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), - (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + (void*)(sendBuff), + (void*)(recvBuff), count, type, op, root, args->comms[i], args->streams[i])); #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) @@ -590,34 +596,14 @@ testResult_t completeColl(struct threadArgs* args) { testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); - if (datacheck) { // 这里的目的应该是让测带宽跑的coll也使用非0数据。 - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); - } - - // Sync - TESTCHECK(startColl(args, type, op, root, in_place, 0)); - TESTCHECK(completeColl(args)); Barrier(args); -#if CUDART_VERSION >= 11030 - cudaGraph_t graphs[args->nGpus]; - cudaGraphExec_t graphExec[args->nGpus]; - if (cudaGraphLaunches >= 1) { - // Begin cuda graph capture - for (int i=0; inGpus; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect - CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); - } - } -#endif - // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { + args->nbytes = sendBytesList[iter]; + args->sendBytes = args->nbytes; if (agg_iters>1) NCCLCHECK(ncclGroupStart()); for (int aiter = 0; aiter < agg_iters; aiter++) { TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); @@ -625,27 +611,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - // End cuda graph capture - for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); - } - // Instantiate cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); - } - // Resync CPU, restart timing, launch cuda graph - Barrier(args); - start = std::chrono::high_resolution_clock::now(); - for (int l=0; lnGpus; i++) { - CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); - } - } - } -#endif - TESTCHECK(completeColl(args)); auto delta = std::chrono::high_resolution_clock::now() - start; @@ -654,16 +619,6 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; Allreduce(args, &deltaSec, average); -#if CUDART_VERSION >= 11030 - if (cudaGraphLaunches >= 1) { - //destroy cuda graph - for (int i=0; inGpus; i++) { - CUDACHECK(cudaGraphExecDestroy(graphExec[i])); - 
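Stripped of the harness plumbing, what this patch sets up per aggregated iteration is a batch of allreduces of different sizes, each on its own pre-allocated buffer pair, issued inside a single NCCL group so the launches can be aggregated. A minimal sketch under those assumptions (ncclFloat/ncclSum stand in for the configured type and op; runAggregatedIter and the parameter names are illustrative):

#include <nccl.h>

ncclResult_t runAggregatedIter(void** sendbufs, void** recvbufs,
                               const size_t* countList, int listLen,
                               ncclComm_t comm, cudaStream_t stream) {
  ncclResult_t res = ncclGroupStart();
  for (int k = 0; k < listLen && res == ncclSuccess; k++) {
    // Each collective has its own element count and its own buffer pair;
    // grouping is what lets NCCL fuse the calls instead of launching one
    // kernel per entry of countList.
    res = ncclAllReduce(sendbufs[k], recvbufs[k], countList[k],
                        ncclFloat, ncclSum, comm, stream);
  }
  ncclResult_t end = ncclGroupEnd();
  return res != ncclSuccess ? res : end;
}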
CUDACHECK(cudaGraphDestroy(graphs[i])); - } - } -#endif - double algBw, busBw; args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); @@ -714,29 +669,16 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { } testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { - // Warm-up for large size - setupArgs(args->maxbytes, type, args); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, type, op, root, 0, iter)); - } - TESTCHECK(completeColl(args)); - - // Warm-up for small size - setupArgs(args->minbytes, type, args); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, type, op, root, 0, iter)); - } - TESTCHECK(completeColl(args)); // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - // TODO: 实测是否恢复? - // TESTCHECK(BenchTime(args, type, op, root, 1)); - PRINT("\n"); - } + args->nbytes = sendBytesList[0]; + args->sendBytes = args->nbytes; + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + // TODO: 实测是否恢复? + // TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + return testSuccess; } @@ -965,6 +907,12 @@ int main(int argc, char* argv[]) { return 0; } +testResult_t AllocateBuffLists(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes) { + CUDACHECK(cudaMalloc(sendbuff, sendBytes)); + CUDACHECK(cudaMalloc(recvbuff, recvBytes)); + return testSuccess; +} + testResult_t run() { int nProcs = 1, proc = 0; int localRank = 0; @@ -1035,17 +983,24 @@ testResult_t run() { MPI_Barrier(MPI_COMM_WORLD); #endif cudaStream_t streams[nGpus*nThreads]; - void* sendbuffs[nGpus*nThreads]; - void* recvbuffs[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads][AGG_ITERS]; + void* recvbuffs[nGpus*nThreads][AGG_ITERS]; void* expected[nGpus*nThreads]; - size_t sendBytes, recvBytes; + // size_t sendBytes, recvBytes; + + // ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + ncclTestEngine.getCollByteCountList(sendBytesList, recvBytesList, countList, agg_iters); for (int i=0; i coll_id = %d, ALLOCATE sendbuff @ %p, recvbuff @ %p", i, j, sendbuffs[i][j], recvbuffs[i][j]); + } } //if parallel init is not selected, use main thread to initialize NCCL @@ -1097,8 +1052,12 @@ testResult_t run() { threads[t].args.nThreads=nThreads; threads[t].args.thread=t; threads[t].args.nGpus=nGpus; - threads[t].args.sendbuffs = sendbuffs+t*nGpus; - threads[t].args.recvbuffs = recvbuffs+t*nGpus; + // threads[t].args.sendbuffs = sendbuffs+t*nGpus; + // threads[t].args.recvbuffs = recvbuffs+t*nGpus; + for (int j = 0; j < AGG_ITERS; j++) { + threads[t].args.sendbuffs[j] = sendbuffs[t][j]; + threads[t].args.recvbuffs[j] = recvbuffs[t][j]; + } threads[t].args.expected = expected+t*nGpus; threads[t].args.ncclId = ncclId; threads[t].args.comms=comms+t*nGpus; @@ -1146,9 +1105,10 @@ testResult_t run() { // Free off CUDA allocated memory for (int 
i=0; i #include "nccl1_compat.h" +#define AGG_ITERS 2 + #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ @@ -83,6 +85,7 @@ struct testEngine { void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName); + void (*getCollByteCountList)(size_t *sendCntList, size_t *recvCntList, const size_t *countList, int listLen); }; extern struct testEngine ncclTestEngine; @@ -100,10 +103,10 @@ struct threadArgs { int thread; int nGpus; int localRank; - void** sendbuffs; + void* sendbuffs[AGG_ITERS]; size_t sendBytes; size_t sendInplaceOffset; - void** recvbuffs; + void* recvbuffs[AGG_ITERS]; size_t recvInplaceOffset; ncclUniqueId ncclId; ncclComm_t* comms; From c84dd891d6ffe1c68a08ee3336b7433eb3a1cd77 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 20 Oct 2022 06:10:42 +0000 Subject: [PATCH 047/109] fix manual size bug --- src_nccl_manual_size/common_nccl_ms.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src_nccl_manual_size/common_nccl_ms.cu b/src_nccl_manual_size/common_nccl_ms.cu index 2a8f6ec..8247baa 100644 --- a/src_nccl_manual_size/common_nccl_ms.cu +++ b/src_nccl_manual_size/common_nccl_ms.cu @@ -602,10 +602,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { - args->nbytes = sendBytesList[iter]; - args->sendBytes = args->nbytes; if (agg_iters>1) NCCLCHECK(ncclGroupStart()); for (int aiter = 0; aiter < agg_iters; aiter++) { + args->nbytes = sendBytesList[aiter]; + args->sendBytes = args->nbytes; TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); } if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); From 93668dc5db820d6a0aad51854d319eccd699f339 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 20 Oct 2022 06:33:13 +0000 Subject: [PATCH 048/109] non-homogeneous nccl manual size --- src_nccl_manual_size/common_nccl_ms.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src_nccl_manual_size/common_nccl_ms.cu b/src_nccl_manual_size/common_nccl_ms.cu index 8247baa..3867e40 100644 --- a/src_nccl_manual_size/common_nccl_ms.cu +++ b/src_nccl_manual_size/common_nccl_ms.cu @@ -18,6 +18,8 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() size_t countList[AGG_ITERS] = {4000, 8192000}; size_t sendBytesList[AGG_ITERS]; size_t recvBytesList[AGG_ITERS]; +// ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; +ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = { @@ -606,7 +608,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t for (int aiter = 0; aiter < agg_iters; aiter++) { args->nbytes = sendBytesList[aiter]; args->sendBytes = args->nbytes; - TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + TESTCHECK(startColl(args, typeList[aiter], op, root, in_place, iter*agg_iters+aiter)); } if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } From 71b40c7cbc21ea47e195187301faf0ee3c1f9da1 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 22 Oct 2022 12:34:19 +0000 Subject: [PATCH 049/109] + cudadev in cbArgs for ofccl manual size --- src_manual_size/common_ms.h | 1 + src_manual_size/ofccl_all_reduce_ms.cu | 11 +++++++---- 2 files changed, 8 insertions(+), 4 
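Patches 046 through 048 are easier to follow when the resulting benchmark loop is read as a whole: the per-collective byte count, and after the non-homogeneous change the per-collective type, must be selected inside the agg_iters loop, between ncclGroupStart and ncclGroupEnd. A consolidated sketch of the loop those diffs produce, reusing the globals and macros already defined in this file (benchLoopMS is an illustrative name):

static testResult_t benchLoopMS(struct threadArgs* args, ncclRedOp_t op,
                                int root, int in_place) {
  for (int iter = 0; iter < iters; iter++) {
    if (agg_iters > 1) NCCLCHECK(ncclGroupStart());
    for (int aiter = 0; aiter < agg_iters; aiter++) {
      // Size and type are chosen per grouped call: this is the fix from
      // patch 047 and the mixed-type extension from patch 048.
      args->nbytes    = sendBytesList[aiter];
      args->sendBytes = args->nbytes;
      TESTCHECK(startColl(args, typeList[aiter], op, root, in_place,
                          iter * agg_iters + aiter));
    }
    if (agg_iters > 1) NCCLCHECK(ncclGroupEnd());
  }
  TESTCHECK(completeColl(args));
  return testSuccess;
}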
deletions(-) diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index c9a477d..6b4285e 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -71,6 +71,7 @@ typedef enum { typedef struct { int collId; int gotCqe; + int cudaDev; pthread_mutex_t mutex; } CallBackArgs; diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu index 2d925f3..13ecc93 100644 --- a/src_manual_size/ofccl_all_reduce_ms.cu +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -84,9 +84,12 @@ int myCallback(int collIdFromCqe, void *args) { ((CallBackArgs *)args)->gotCqe = 1; pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); // 这个函数之后在poller线程里调用的,所以这个获得的dev应该是不对的。 + int collId = ((CallBackArgs *)args)->collId; + int cudaDev = ((CallBackArgs *)args)->cudaDev; + OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); return 0; } @@ -95,13 +98,13 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - // CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); args->collId = collId; args->gotCqe = 0; + args->cudaDev = cudaDev; pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d", pthread_self(), cudaDev, collId); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; From f2b285d3faa6fde7fe746847f09651a503df6771 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 24 Oct 2022 02:31:16 +0000 Subject: [PATCH 050/109] 161 maunal size from resnet --- src_manual_size/common_ms.cu | 2 +- src_manual_size/common_ms.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 08687bb..d81e02a 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -17,7 +17,7 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() // TODO: 丑丑地搞个全局变量 // size_t countList[MULTI_ITERS] = {4000, 8192000}; -size_t countList[MULTI_ITERS] = {4000, 8192000}; +size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 
65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; size_t sendBytesList[MULTI_ITERS]; size_t recvBytesList[MULTI_ITERS]; diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index 6b4285e..04332a8 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -18,7 +18,8 @@ // #define DEBUG_PRINT 1 -#define MULTI_ITERS 2 +// #define MULTI_ITERS 2 +#define MULTI_ITERS 161 #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) From a32587b25d86a5c3f4df86d557d9816913c9c9cb Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 24 Oct 2022 02:45:38 +0000 Subject: [PATCH 051/109] accurate damie --- src_manual_size/common_ms.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index d81e02a..a361657 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -1240,7 +1240,7 @@ testResult_t run() { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); - if (multi_iters != 2) { + if (multi_iters != MULTI_ITERS) { // TODO: he is only a baby T^T OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d damie", pthread_self(), cudaDev, multi_iters); } From 5d07bca66f6aab048068220ab765376feb36d067 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 24 Oct 2022 02:48:26 +0000 Subject: [PATCH 052/109] . --- src_manual_size/common_ms.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index a361657..6789d9f 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -1241,8 +1241,7 @@ testResult_t run() { int cudaDev; CUDACHECK(cudaGetDevice(&cudaDev)); if (multi_iters != MULTI_ITERS) { - // TODO: he is only a baby T^T - OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d damie", pthread_self(), cudaDev, multi_iters); + OFTEST_LOG(TEST_FATAL, "<%lu> Rank<%d>, multi_iters = %d damie", pthread_self(), cudaDev, multi_iters); } OFTEST_LOG(TEST_INIT, "<%lu> Rank<%d>, multi_iters = %d", pthread_self(), cudaDev, multi_iters); #define MAX_LINE 2048 From 4f06775461b641f780739ca67695546d7c9d97a7 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 31 Oct 2022 11:41:08 +0000 Subject: [PATCH 053/109] aggressive no sync --- src_manual_size/common_ms.cu | 7 +++++-- src_manual_size/common_ms.h | 1 + src_manual_size/ofccl_all_reduce_ms.cu | 10 +++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 6789d9f..7f4e36c 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -17,7 +17,8 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() // TODO: 丑丑地搞个全局变量 // size_t countList[MULTI_ITERS] = {4000, 8192000}; -size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 
1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; +// size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; +size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; size_t sendBytesList[MULTI_ITERS]; size_t recvBytesList[MULTI_ITERS]; @@ -818,13 +819,15 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t for (int iter = 0; iter < iters; iter++) { for (int miter = 0; miter < multi_iters; miter++) { - seenCqe[miter] = 0; + seenCqe[miter] = 0; // TODO: 这样的写法或许不能保证“同步”,即现在的161个都跑完,才去启动下一波161个。 TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args)); + // usleep(100000); + int cudaDev; cudaGetDevice(&cudaDev); OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index 04332a8..c780398 100644 --- a/src_manual_size/common_ms.h 
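The ofccl path replaces stream synchronization with a CQE callback: ofcclRunAllReduce is handed a CallBackArgs, the poller thread invokes the callback when the collective's CQE arrives, and the benchmark thread spins until every collective in the batch has been marked done. A minimal model of that handshake, assuming the CallBackArgs layout declared in common_ms.h (markDone and waitBatch are illustrative names, not harness functions):

#include <pthread.h>

static int markDone(int collIdFromCqe, void* argsPtr) {
  CallBackArgs* cb = (CallBackArgs*)argsPtr;
  pthread_mutex_lock(&cb->mutex);
  cb->gotCqe = 1;                    // record completion only; calling
  pthread_mutex_unlock(&cb->mutex);  // cudaGetDevice() here would report the
  return 0;                          // poller thread's device, not the rank's
}

static void waitBatch(CallBackArgs* cbs, int n) {
  int done = 0;
  while (done < n) {                 // busy-wait; assumes every CQE arrives
    done = 0;
    for (int i = 0; i < n; i++) {
      pthread_mutex_lock(&cbs[i].mutex);
      done += cbs[i].gotCqe;
      pthread_mutex_unlock(&cbs[i].mutex);
    }
  }
}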
+++ b/src_manual_size/common_ms.h @@ -10,6 +10,7 @@ #include #include #include +#include // usleep #ifdef MPI_SUPPORT #include "mpi.h" #endif diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu index 13ecc93..74f4866 100644 --- a/src_manual_size/ofccl_all_reduce_ms.cu +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -87,10 +87,10 @@ int myCallback(int collIdFromCqe, void *args) { // int cudaDev; // CUDACHECK(cudaGetDevice(&cudaDev)); // 这个函数之后在poller线程里调用的,所以这个获得的dev应该是不对的。 - int collId = ((CallBackArgs *)args)->collId; - int cudaDev = ((CallBackArgs *)args)->cudaDev; - - OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + // int collId = ((CallBackArgs *)args)->collId; + // int cudaDev = ((CallBackArgs *)args)->cudaDev; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + return 0; } @@ -104,7 +104,7 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); - OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d", pthread_self(), cudaDev, collId); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); return testSuccess; From 40fbb707108035832a058cd72925ba2ac58a9ed5 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 7 Nov 2022 14:45:58 +0000 Subject: [PATCH 054/109] a new permutation from oneflow --- src_manual_size/common_ms.cu | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 7f4e36c..2fa7bda 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -19,6 +19,11 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() // size_t countList[MULTI_ITERS] = {4000, 8192000}; // size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 
2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; +int idxList[2][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + } +}; size_t sendBytesList[MULTI_ITERS]; size_t recvBytesList[MULTI_ITERS]; @@ -810,6 +815,9 @@ testResult_t completeColl(struct threadArgs *args) { testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, ofcclRankCtx_t rankCtx) { + int cudaDev; + cudaGetDevice(&cudaDev); + size_t count = args->nbytes / wordSize(type); Barrier(args); @@ -817,8 +825,11 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { - - for (int miter = 0; miter < multi_iters; miter++) { + // 在这个地方改变miter的遍历顺序,起到乱序调用的作用。 + for (int miter_idx = 0; miter_idx < multi_iters; miter_idx++) { + int miter = idxList[cudaDev][miter_idx]; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke %dth startColl iter for coll_id = %d", pthread_self(), cudaDev, iter, miter); + // for (int miter = 0; miter < multi_iters; miter++) { seenCqe[miter] = 0; // TODO: 这样的写法或许不能保证“同步”,即现在的161个都跑完,才去启动下一波161个。 TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter, rankCtx)); @@ -827,9 +838,6 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t TESTCHECK(completeColl(args)); // usleep(100000); - - int 
cudaDev; - cudaGetDevice(&cudaDev); OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } From e98b271cb988232762d99b4f96c31d620715eabd Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 12 Nov 2022 14:12:31 +0000 Subject: [PATCH 055/109] log --- src_manual_size/common_ms.cu | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 2fa7bda..66f5bfc 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -785,7 +785,7 @@ testResult_t startColl(struct threadArgs *args, ncclDataType_t type, return testSuccess; } -testResult_t completeColl(struct threadArgs *args) { +testResult_t completeColl(struct threadArgs *args, int iter=0) { if (blocking_coll) return testSuccess; @@ -799,10 +799,10 @@ testResult_t completeColl(struct threadArgs *args) { gotCqeCnt++; seenCqe[i] = 1; - // int cudaDev; - // CUDACHECK(cudaGetDevice(&cudaDev)); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); // if (cudaDev == 0) { - // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get cqe for coll_id = %d", pthread_self(), cudaDev, i); + OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get %dth cqe for coll_id = %d", pthread_self(), cudaDev, iter, i); // } } @@ -824,18 +824,17 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); - for (int iter = 0; iter < iters; iter++) { + for (int iter = 1; iter <= iters; iter++) { // 在这个地方改变miter的遍历顺序,起到乱序调用的作用。 - for (int miter_idx = 0; miter_idx < multi_iters; miter_idx++) { + for (int miter_idx = 0; miter_idx < multi_iters; miter_idx++) { // for (int miter = 0; miter < multi_iters; miter++) { int miter = idxList[cudaDev][miter_idx]; // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke %dth startColl iter for coll_id = %d", pthread_self(), cudaDev, iter, miter); - // for (int miter = 0; miter < multi_iters; miter++) { - seenCqe[miter] = 0; // TODO: 这样的写法或许不能保证“同步”,即现在的161个都跑完,才去启动下一波161个。 + seenCqe[miter] = 0; TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter, rankCtx)); } - TESTCHECK(completeColl(args)); + TESTCHECK(completeColl(args, iter)); // usleep(100000); OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); From 3598de42295060710109a751f69cde0e69747ab9 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 14 Nov 2022 02:52:01 +0000 Subject: [PATCH 056/109] suit 8 cards --- src_manual_size/common_ms.cu | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 66f5bfc..1243e38 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -19,7 +19,16 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() // size_t countList[MULTI_ITERS] = {4000, 8192000}; // size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 
1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; -int idxList[2][MULTI_ITERS] = { +int idxList[8][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 
39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 
114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 } @@ -799,11 +808,9 @@ testResult_t completeColl(struct threadArgs *args, int iter=0) { gotCqeCnt++; seenCqe[i] = 1; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - // if (cudaDev == 0) { - OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get %dth cqe for coll_id = %d", pthread_self(), cudaDev, iter, i); - // } + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, completeColl get %dth cqe for coll_id = %d", pthread_self(), cudaDev, iter, i); } } From bcf3b874b1a879079e21c6d77a089b1eab57667d Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 18 Nov 2022 05:21:49 +0000 Subject: [PATCH 057/109] use prepareDone --- src_simple/common_simple.cu | 10 +++++----- src_simple/ofccl_all_reduce.cu | 11 ++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 42cbe1c..4bc04bb 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -814,9 +814,9 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t TESTCHECK(completeColl(args)); - int cudaDev; - cudaGetDevice(&cudaDev); - OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } auto delta = std::chrono::high_resolution_clock::now() - start; @@ -912,8 +912,8 @@ testResult_t TimeTest(struct threadArgs *args, ncclDataType_t type, // OFTEST_LOG(TEST, "<%lu> Rank<%d>, initData OK", pthread_self(), thrdCudaDev); } - // ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a bit and saves the kernel launch time. Also make ofccl not quit aggressively on its own. - ofcclFinalizeRankCtx7StartHostThrds(rankCtx); + ofcclPrepareDone(rankCtx); // TODO: keep this when measuring performance; it cheats a bit and saves the kernel launch time. Also make ofccl not quit aggressively on its own. + // ofcclFinalizeRankCtx7StartHostThrds(rankCtx); // TODO: if we support multi size, we could warm up for every size; or keep the current approach, but make sure the correct comm is selected. // warmup still needs to be enabled, otherwise ofccl performance drops badly. diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 42c9628..50aaad8 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -36,9 +36,6 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - for (int i=0; i<args->nGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; CUDACHECK(cudaSetDevice(gpuid)); @@ -49,6 +46,9 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); } + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done AllReduceInitData", pthread_self(), cudaDev); return testSuccess; } @@ -80,8 +80,6 @@ int myCallback(int collIdFromCqe, void *args) { } testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev));
// CallBackArgs *args = (CallBackArgs *)malloc(sizeof(CallBackArgs)); args->collId = collId; @@ -89,6 +87,9 @@ testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa pthread_mutex_init(&args->mutex, NULL); NCCLCHECK(ofcclRunAllReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); From 0ccfcc996b368407a8ba7eb647f3a8d54d98bdae Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 20 Nov 2022 13:36:00 +0000 Subject: [PATCH 058/109] nccl ms different order --- src_nccl_manual_size/common_nccl_ms.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src_nccl_manual_size/common_nccl_ms.cu b/src_nccl_manual_size/common_nccl_ms.cu index 3867e40..e4449e1 100644 --- a/src_nccl_manual_size/common_nccl_ms.cu +++ b/src_nccl_manual_size/common_nccl_ms.cu @@ -20,6 +20,10 @@ size_t sendBytesList[AGG_ITERS]; size_t recvBytesList[AGG_ITERS]; // ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; ncclDataType_t typeList[AGG_ITERS] = {ncclInt32, ncclFloat}; +int idxList[8][AGG_ITERS] = { + {0, 1}, + {1, 0} +}; #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = { @@ -598,6 +602,8 @@ testResult_t completeColl(struct threadArgs* args) { testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); + int cudaDev; + cudaGetDevice(&cudaDev); Barrier(args); @@ -605,7 +611,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { if (agg_iters>1) NCCLCHECK(ncclGroupStart()); - for (int aiter = 0; aiter < agg_iters; aiter++) { + // for (int aiter = 0; aiter < agg_iters; aiter++) { + for (int aiter_idx = 0; aiter_idx < agg_iters; aiter_idx++) { + int aiter = idxList[cudaDev][aiter_idx]; args->nbytes = sendBytesList[aiter]; args->sendBytes = args->nbytes; TESTCHECK(startColl(args, typeList[aiter], op, root, in_place, iter*agg_iters+aiter)); From 2b19a59a9e4a284adf1ed9de0d8e24c63fe251a7 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 26 Nov 2022 05:34:52 +0000 Subject: [PATCH 059/109] usleep --- src_manual_size/common_ms.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 1243e38..f8ad9a1 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -837,13 +837,14 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int miter = idxList[cudaDev][miter_idx]; // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke %dth startColl iter for coll_id = %d", pthread_self(), cudaDev, iter, miter); seenCqe[miter] = 0; + usleep(200); TESTCHECK(startColl(args, type, op, root, in_place, iter * multi_iters + miter, miter, rankCtx)); } TESTCHECK(completeColl(args, iter)); - // usleep(100000); + usleep(100000); OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters); } From b3b632348a9ce03339cb5e548b5b12e6800c50f9 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 26 Nov 2022 14:02:10 +0000 Subject: [PATCH 060/109] + 
ofccl_test.sh --- ofccl_test.sh | 60 ++++++++++++++++++++++++++++++++++++ src_manual_size/common_ms.cu | 2 +- 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 ofccl_test.sh diff --git a/ofccl_test.sh b/ofccl_test.sh new file mode 100644 index 0000000..f1930be --- /dev/null +++ b/ofccl_test.sh @@ -0,0 +1,60 @@ +clear + +cd /home/panlichen/work2/nccl-tests +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 +export MY_NUM_DEV=2 +# export CUDA_VISIBLE_DEVICES=0,1,4,5 +export SHOW_ALL_PREPARED_COLL=0 +export NITER=4 +export NBYTES=8K +export WARMITER=2 +export MITER=4 + +export TRAVERSE_TIMES=10 +export TOLERANT_FAIL_CHECK_SQ_CNT=500 +export CNT_BEFORE_QUIT=5 +export TOLERANT_UNPROGRESSED_CNT=50000 +export BASE_CTX_SWITCH_THRESHOLD=100 + +echo TRAVERSE_TIMES=$TRAVERSE_TIMES +echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT +echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT +echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT +echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD + +if [ -z $BINARY ];then + BINARY="NORMAL" + BINARY="MS" +fi + +if [ "$BINARY" == "NORMAL" ];then + target="./build/ofccl_all_reduce_perf" +elif [ "$BINARY" == "MS" ];then + target="./build/ofccl_all_reduce_ms_perf" + export NITER=200 + export MY_NUM_DEV=8 + export SHOW_ALL_PREPARED_COLL=1 + export WARMITER=0 +fi + + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c 0 -M $MITER" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="cuda-gdb $target" +elif [ "$RUN_TYPE" == "NSYS" ];then + cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b 64M -e 64M -f 2 -t $MY_NUM_DEV -g 1 -n 1 -w 0 -c 0" +fi + +echo cmd=$cmd +$cmd + diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index f8ad9a1..e54bb30 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -1129,7 +1129,7 @@ int main(int argc, char *argv[]) { iters = (int)strtol(optarg, NULL, 0); break; case 'M': - multi_iters = (int)strtol(optarg, NULL, 0); + // multi_iters = (int)strtol(optarg, NULL, 0); break; case 'm': #if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) From e2bfe2e80b6293ee7f50cfa0910e4e0069db91db Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 29 Nov 2022 16:55:20 +0000 Subject: [PATCH 061/109] scripts --- ofccl_test.sh | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index f1930be..c9c898d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -7,19 +7,14 @@ export NCCL_ALGO=Ring # export NCCL_MAX_NCHANNELS=1 # export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -export MY_NUM_DEV=2 -# export CUDA_VISIBLE_DEVICES=0,1,4,5 -export SHOW_ALL_PREPARED_COLL=0 -export NITER=4 -export NBYTES=8K -export WARMITER=2 -export MITER=4 + +export CHECK=0 export TRAVERSE_TIMES=10 -export TOLERANT_FAIL_CHECK_SQ_CNT=500 +export TOLERANT_FAIL_CHECK_SQ_CNT=5000 export CNT_BEFORE_QUIT=5 export TOLERANT_UNPROGRESSED_CNT=50000 -export BASE_CTX_SWITCH_THRESHOLD=100 +export BASE_CTX_SWITCH_THRESHOLD=80 echo TRAVERSE_TIMES=$TRAVERSE_TIMES echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT @@ -28,18 +23,38 @@ echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo 
BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD if [ -z $BINARY ];then - BINARY="NORMAL" + BINARY="DEBUG" BINARY="MS" + BINARY="PERF" fi -if [ "$BINARY" == "NORMAL" ];then +if [ "$BINARY" == "DEBUG" ];then + target="./build/ofccl_all_reduce_perf" + export MY_NUM_DEV=8 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export SHOW_ALL_PREPARED_COLL=1 + export NITER=4 + export NBYTES=8K + export WARMITER=2 + export MITER=4 +elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" + export MY_NUM_DEV=2 + export CUDA_VISIBLE_DEVICES=0,1,4,5 + export SHOW_ALL_PREPARED_COLL=0 + export NITER=4 + export NBYTES=8K + export WARMITER=2 + export MITER=4 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" - export NITER=200 export MY_NUM_DEV=8 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export NITER=200 export SHOW_ALL_PREPARED_COLL=1 export WARMITER=0 + export NBYTES=8K + export MITER=4 fi @@ -48,7 +63,7 @@ if [ -z $RUN_TYPE ];then fi if [ "$RUN_TYPE" == "PURE" ];then - cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c 0 -M $MITER" + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" elif [ "$RUN_TYPE" == "NSYS" ];then @@ -56,5 +71,5 @@ elif [ "$RUN_TYPE" == "NSYS" ];then fi echo cmd=$cmd -$cmd +$cmd #> /home/panlichen/work2/ofccl/log/ofccl.log From 34cd2754d2ef06c57e648b3835e784007b2c837e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 30 Nov 2022 06:10:33 +0000 Subject: [PATCH 062/109] script --- ofccl_test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index c9c898d..9dd6ff5 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -39,8 +39,8 @@ if [ "$BINARY" == "DEBUG" ];then export MITER=4 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 - export CUDA_VISIBLE_DEVICES=0,1,4,5 + export MY_NUM_DEV=8 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 export NITER=4 export NBYTES=8K From 4e8162026d8485260dd4d2218a09828d1c668936 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 1 Dec 2022 15:38:33 +0000 Subject: [PATCH 063/109] scripts --- ofccl_test.sh | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 9dd6ff5..7cdf163 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -11,21 +11,26 @@ export NCCL_ALGO=Ring export CHECK=0 export TRAVERSE_TIMES=10 -export TOLERANT_FAIL_CHECK_SQ_CNT=5000 -export CNT_BEFORE_QUIT=5 -export TOLERANT_UNPROGRESSED_CNT=50000 +export TOLERANT_UNPROGRESSED_CNT=8000 export BASE_CTX_SWITCH_THRESHOLD=80 +# export ENABLE_VQ=1 +# export TOLERANT_FAIL_CHECK_SQ_CNT=5000 +# export CNT_BEFORE_QUIT=5 + echo TRAVERSE_TIMES=$TRAVERSE_TIMES -echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT -echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +if [ ! 
-z $BINARY ];then + echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT + echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT +fi + if [ -z $BINARY ];then BINARY="DEBUG" BINARY="MS" - BINARY="PERF" + # BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then From 0c3718e300f0c4e3108b14bc131c66671ba2f657 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 5 Dec 2022 08:05:38 +0000 Subject: [PATCH 064/109] scripts --- ofccl_test.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 7cdf163..b0384ee 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -4,15 +4,16 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -# export NCCL_MAX_NCHANNELS=1 -# export NCCL_MIN_NCHANNELS=1 -# export NCCL_NTHREADS=64 +export NCCL_MAX_NCHANNELS=1 +export NCCL_MIN_NCHANNELS=1 +export NCCL_NTHREADS=64 export CHECK=0 export TRAVERSE_TIMES=10 -export TOLERANT_UNPROGRESSED_CNT=8000 +export TOLERANT_UNPROGRESSED_CNT=10000 export BASE_CTX_SWITCH_THRESHOLD=80 +export BOUNS_SWITCH_4_PROCESSED_COLL=100 # export ENABLE_VQ=1 # export TOLERANT_FAIL_CHECK_SQ_CNT=5000 @@ -21,6 +22,7 @@ export BASE_CTX_SWITCH_THRESHOLD=80 echo TRAVERSE_TIMES=$TRAVERSE_TIMES echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD +echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL if [ ! -z $BINARY ];then echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT @@ -39,7 +41,7 @@ if [ "$BINARY" == "DEBUG" ];then # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=1 export NITER=4 - export NBYTES=8K + export NBYTES=1G export WARMITER=2 export MITER=4 elif [ "$BINARY" == "PERF" ];then From dba5947486b454affa89e37149f6bf01d75f3250 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 7 Dec 2022 06:00:07 +0000 Subject: [PATCH 065/109] scripts --- ofccl_test.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index b0384ee..b0b3452 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -1,12 +1,15 @@ clear +export DEBUG_CC=0 +export DEBUG_ENQ=0 + cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -export NCCL_MAX_NCHANNELS=1 -export NCCL_MIN_NCHANNELS=1 -export NCCL_NTHREADS=64 +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 export CHECK=0 @@ -40,10 +43,11 @@ if [ "$BINARY" == "DEBUG" ];then export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=1 - export NITER=4 + export NITER=10 export NBYTES=1G - export WARMITER=2 + export WARMITER=0 export MITER=4 + export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" export MY_NUM_DEV=8 @@ -67,12 +71,14 @@ fi if [ -z $RUN_TYPE ];then RUN_TYPE="PURE" + # RUN_TYPE="GDB" fi if [ "$RUN_TYPE" == "PURE" ];then cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" + # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 elif [ "$RUN_TYPE" == "NSYS" ];then cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b 64M -e 64M -f 2 -t $MY_NUM_DEV -g 1 -n 1 -w 0 -c 0" fi From 10fefc6110ce68e96d0158196577810df2382c2b Mon Sep 17 00:00:00 2001 From: Panlichen Date: 
Thu, 8 Dec 2022 08:34:36 +0000 Subject: [PATCH 066/109] scripts --- ofccl_test.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index b0b3452..e66c4e5 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -16,7 +16,8 @@ export CHECK=0 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 export BASE_CTX_SWITCH_THRESHOLD=80 -export BOUNS_SWITCH_4_PROCESSED_COLL=100 +export BOUNS_SWITCH_4_PROCESSED_COLL=0 +export DEV_TRY_ROUND=10 # export ENABLE_VQ=1 # export TOLERANT_FAIL_CHECK_SQ_CNT=5000 @@ -26,6 +27,7 @@ echo TRAVERSE_TIMES=$TRAVERSE_TIMES echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL +echo DEV_TRY_ROUND=$DEV_TRY_ROUND if [ ! -z $BINARY ];then echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT @@ -40,13 +42,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=1 - export NITER=10 - export NBYTES=1G + export NITER=1 + export NBYTES=1M export WARMITER=0 - export MITER=4 + export MITER=1 export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" From 7a79d985070a2e9c5c170f53fc79cd2a3bc5d7d4 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 9 Dec 2022 16:37:18 +0000 Subject: [PATCH 067/109] little ms --- ofccl_test.sh | 20 ++++----- src_manual_size/common_ms.cu | 80 ++++++++++++++++++++++++++++-------- src_manual_size/common_ms.h | 10 ++++- 3 files changed, 80 insertions(+), 30 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index e66c4e5..68cc75b 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -7,9 +7,9 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -# export NCCL_MAX_NCHANNELS=1 -# export NCCL_MIN_NCHANNELS=1 -# export NCCL_NTHREADS=64 +export NCCL_MAX_NCHANNELS=1 +export NCCL_MIN_NCHANNELS=1 +export NCCL_NTHREADS=64 export CHECK=0 @@ -36,19 +36,19 @@ fi if [ -z $BINARY ];then BINARY="DEBUG" - BINARY="MS" + # BINARY="MS" # BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 - export SHOW_ALL_PREPARED_COLL=1 - export NITER=1 - export NBYTES=1M + export SHOW_ALL_PREPARED_COLL=0 + export NITER=40 + export NBYTES=128M export WARMITER=0 - export MITER=1 + export MITER=2 export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" @@ -86,5 +86,5 @@ elif [ "$RUN_TYPE" == "NSYS" ];then fi echo cmd=$cmd -$cmd #> /home/panlichen/work2/ofccl/log/ofccl.log +$cmd > /home/panlichen/work2/ofccl/log/ofccl.log diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index e54bb30..0ed1041 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -15,24 +15,68 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() -// TODO: 丑丑地搞个全局变量 -// size_t countList[MULTI_ITERS] = {4000, 8192000}; -// size_t countList[MULTI_ITERS] = {4000, 8192000, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 4194304, 2048, 2048, 9437184, 2048, 2048, 4194304, 8192, 8192, 8192, 8192, 4194304, 8388608, 2048, 2048, 9437184, 2048, 2048, 2097152, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 
2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 1048576, 1024, 1024, 2359296, 1024, 1024, 1048576, 4096, 4096, 4096, 4096, 1048576, 2097152, 1024, 1024, 2359296, 1024, 1024, 524288, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 262144, 512, 512, 589824, 512, 512, 262144, 2048, 2048, 2048, 2048, 262144, 524288, 512, 512, 589824, 512, 512, 131072, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 65536, 256, 256, 147456, 256, 256, 65536, 1024, 1024, 1024, 1024, 65536, 65536, 256, 256, 147456, 256, 256, 16384, 256, 256, 37632}; -size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; -int idxList[8][MULTI_ITERS] = { - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 
145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 
156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - } -}; +#ifdef FULL_MS + size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; + int idxList[8][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 
5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 
158, 48, 57, 94, 0, 88 + } + }; +#else + // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512}; + // size_t idxList[8][MULTI_ITERS] = { + // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15}, + // {4, 5, 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 4, 5, 6, 9, 10, 11, 7, 12, 13, 14, 15}, + // {0, 1, 2, 3, 8, 4, 5, 6, 9, 10, 11, 7, 12, 13, 14, 15}, + // {4, 2, 3, 6, 7, 8, 5, 0, 1, 9, 10, 11, 12, 13, 14, 15}, + // {4, 2, 3, 1, 9, 10, 11, 6, 7, 8, 5, 0, 12, 13, 14, 15}, + // {4, 2, 3, 1, 9, 5, 0, 12, 13, 14, 10, 11, 6, 7, 8, 15} + // // {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0} + // }; + + // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 1048576}; + // size_t idxList[8][MULTI_ITERS] = { + // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + // {0, 2, 1, 3, 5, 4, 6, 9, 8, 7}, + // {3, 2, 5, 6, 4, 7, 1, 9, 8, 0}, + // {1, 2, 4, 5, 7, 6, 8, 9, 3, 0}, + // {2, 0, 5, 7, 4, 8, 9, 6, 3, 1}, + // {3, 4, 8, 2, 1, 0, 5, 7, 9, 6}, + // {1, 3, 9, 2, 4, 7, 8, 0, 5, 6}, + // {2, 6, 8, 1, 3, 0, 4, 5, 7, 9} + // }; + size_t countList[MULTI_ITERS] = {256, 147456}; + size_t idxList[8][MULTI_ITERS] = { + {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1}, + // {0, 1} + + {1, 0}, + {1, 0}, + {0, 1}, + {1, 0}, + {0, 1}, + {1, 0}, + {0, 1} + }; +#endif + size_t sendBytesList[MULTI_ITERS]; size_t recvBytesList[MULTI_ITERS]; diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index c780398..3da8981 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -19,8 +19,14 @@ // #define DEBUG_PRINT 1 -// #define MULTI_ITERS 2 -#define MULTI_ITERS 161 +// #define FULL_MS 1 + +#ifdef FULL_MS + #define MULTI_ITERS 161 +#else + // #define MULTI_ITERS 16 + #define MULTI_ITERS 2 +#endif #define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) From 7b37ceae7c4d1e2a19b965831b1b56760e0d0530 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 19 Dec 2022 11:17:25 +0000 Subject: [PATCH 068/109] + nccl_test.sh --- nccl_test.sh | 71 +++++++++++++++++++++++++++++++++++++ ofccl_test.sh | 25 ++++++++----- src_manual_size/common_ms.h | 2 +- 3 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 nccl_test.sh diff --git a/nccl_test.sh b/nccl_test.sh new file mode 100644 index 0000000..89c241a --- /dev/null +++ b/nccl_test.sh @@ -0,0 +1,71 @@ +clear + +cd /home/panlichen/work2/nccl-tests +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +if [ -z $BINARY ];then + # BINARY="DEBUG" + # BINARY="MS" + BINARY="PERF" +fi + +if [ "$BINARY" == "DEBUG" ];then + export MY_NUM_DEV=8 + # target="./build/ofccl_all_reduce_perf" + # # export CUDA_VISIBLE_DEVICES=0,1,4,5 + # export SHOW_ALL_PREPARED_COLL=0 + # export NITER=40 + # export NBYTES=128M + # export WARMITER=0 + # export MITER=2 + # export CHECK=0 +elif [ "$BINARY" == "PERF" ];then + target="./build/all_reduce_perf" + export MY_NUM_DEV=8 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export SHOW_ALL_PREPARED_COLL=0 + export NITER=4 + export NBYTES=8K + export WARMITER=2 + export MITER=4 +elif [ "$BINARY" == "MS" ];then + export MY_NUM_DEV=8 + # target="./build/ofccl_all_reduce_ms_perf" + # # export CUDA_VISIBLE_DEVICES=0,1,4,5 + # export NITER=200 + # export SHOW_ALL_PREPARED_COLL=1 + # export WARMITER=0 + # export NBYTES=8K + # export MITER=4 +fi + +export NSYS_FILE="nccl" +export NCU_FILE="nccl" + +if [ -z $RUN_TYPE ];then + RUN_TYPE="PURE" + # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" + # RUN_TYPE="NCU" +fi + +if [ "$RUN_TYPE" == "PURE" ];then + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" +elif [ "$RUN_TYPE" == "GDB" ];then + cmd="cuda-gdb $target" + # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 +elif [ "$RUN_TYPE" == "NSYS" ];then + cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" +elif [ "$RUN_TYPE" == "NCU" ];then + # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" +fi + +echo cmd=$cmd +$cmd #> /home/panlichen/work2/ofccl/log/ofccl-2ms-coll-master.log + diff --git a/ofccl_test.sh b/ofccl_test.sh index 68cc75b..881354d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -7,9 +7,9 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -export NCCL_MAX_NCHANNELS=1 -export NCCL_MIN_NCHANNELS=1 -export NCCL_NTHREADS=64 +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 export CHECK=0 @@ -19,7 +19,7 @@ export BASE_CTX_SWITCH_THRESHOLD=80 export BOUNS_SWITCH_4_PROCESSED_COLL=0 export DEV_TRY_ROUND=10 -# export ENABLE_VQ=1 +# export ENABLE_VQ=1 # volunteer quit # export 
TOLERANT_FAIL_CHECK_SQ_CNT=5000 # export CNT_BEFORE_QUIT=5 @@ -29,15 +29,15 @@ echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL echo DEV_TRY_ROUND=$DEV_TRY_ROUND -if [ ! -z $BINARY ];then +if [ ! -z $ENABLE_VQ ];then echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT fi if [ -z $BINARY ];then BINARY="DEBUG" - # BINARY="MS" - # BINARY="PERF" + BINARY="MS" + BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then @@ -70,10 +70,14 @@ elif [ "$BINARY" == "MS" ];then export MITER=4 fi +export NSYS_FILE="ofccl" +export NCU_FILE="ofccl" if [ -z $RUN_TYPE ];then RUN_TYPE="PURE" # RUN_TYPE="GDB" + # RUN_TYPE="NSYS" + # RUN_TYPE="NCU" fi if [ "$RUN_TYPE" == "PURE" ];then @@ -82,9 +86,12 @@ elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 elif [ "$RUN_TYPE" == "NSYS" ];then - cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b 64M -e 64M -f 2 -t $MY_NUM_DEV -g 1 -n 1 -w 0 -c 0" + cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" +elif [ "$RUN_TYPE" == "NCU" ];then + # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" fi echo cmd=$cmd -$cmd > /home/panlichen/work2/ofccl/log/ofccl.log +$cmd #> /home/panlichen/work2/ofccl/log/ofccl-2ms-coll-master.log diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index 3da8981..1785efe 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -19,7 +19,7 @@ // #define DEBUG_PRINT 1 -// #define FULL_MS 1 +#define FULL_MS 1 #ifdef FULL_MS #define MULTI_ITERS 161 From 5bb88a119a17069d327c5f874f1b18741e3a39db Mon Sep 17 00:00:00 2001 From: Panlichen Date: Mon, 19 Dec 2022 13:12:35 +0000 Subject: [PATCH 069/109] fix bug in nccl_tests.sh --- nccl_test.sh | 9 ++++--- ofccl_test.sh | 2 ++ src/common.cu | 50 +++++++++++++++++++++++++++++++++++-- src/common.h | 2 ++ src_simple/common_simple.cu | 6 ++++- 5 files changed, 62 insertions(+), 7 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index 89c241a..2243904 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -33,6 +33,7 @@ elif [ "$BINARY" == "PERF" ];then export NBYTES=8K export WARMITER=2 export MITER=4 + export CHECK=0 elif [ "$BINARY" == "MS" ];then export MY_NUM_DEV=8 # target="./build/ofccl_all_reduce_ms_perf" @@ -55,15 +56,15 @@ if [ -z $RUN_TYPE ];then fi if [ "$RUN_TYPE" == "PURE" ];then - cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 elif [ "$RUN_TYPE" == "NSYS" ];then - cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o 
/home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER" elif [ "$RUN_TYPE" == "NCU" ];then - # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" - cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + # cmd="ncu --nvtx -f -o /home/panlichen/work2/ofccl/log/nsys/$NCU_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER" + cmd="ncu $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -m $MITER" fi echo cmd=$cmd diff --git a/ofccl_test.sh b/ofccl_test.sh index 881354d..536e56a 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -59,6 +59,7 @@ elif [ "$BINARY" == "PERF" ];then export NBYTES=8K export WARMITER=2 export MITER=4 + export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" export MY_NUM_DEV=8 @@ -68,6 +69,7 @@ elif [ "$BINARY" == "MS" ];then export WARMITER=0 export NBYTES=8K export MITER=4 + export CHECK=0 fi export NSYS_FILE="ofccl" diff --git a/src/common.cu b/src/common.cu index 9c2588a..716362b 100644 --- a/src/common.cu +++ b/src/common.cu @@ -652,6 +652,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t double deltaSec = std::chrono::duration_cast>(delta).count(); deltaSec = deltaSec/(iters*agg_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "Rank<%d>, time = %lfus", cudaDev, deltaSec * 1.0E6); Allreduce(args, &deltaSec, average); #if CUDART_VERSION >= 11030 @@ -673,6 +676,50 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t static __thread int rep = 0; rep++; if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture for data check + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); + } + } +#endif + + //test validation in single itertion, should ideally be included into the multi-iteration run + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Launch cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } +#endif + + TESTCHECK(completeColl(args)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); //aggregate delta from all threads and procs @@ -733,8 +780,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); TESTCHECK(BenchTime(args, type, op, root, 0)); - // TODO: 实测是否恢复? - // TESTCHECK(BenchTime(args, type, op, root, 1)); + TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } return testSuccess; diff --git a/src/common.h b/src/common.h index bd84d01..745bd76 100644 --- a/src/common.h +++ b/src/common.h @@ -16,6 +16,8 @@ #include #include "nccl1_compat.h" +#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) + #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 4bc04bb..6701244 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -822,9 +822,13 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t auto delta = std::chrono::high_resolution_clock::now() - start; double deltaSec = std::chrono::duration_cast>(delta).count(); - deltaSec = deltaSec / (iters * agg_iters *multi_iters); + deltaSec = deltaSec / (iters * multi_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec / cudaGraphLaunches; + // int cudaDev; + // cudaGetDevice(&cudaDev); + // OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + Allreduce(args, &deltaSec, average); double algBw, busBw; From d9f1a554abf857c00d5896894ef7ed5c320db43b Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 21 Dec 2022 09:33:10 +0000 Subject: [PATCH 070/109] + run multi test scripts --- .gitignore | 4 +- nccl_test.sh | 20 +++---- ofccl_test.sh | 18 +++---- test_scripts/nccl/run.sh | 38 +++++++++++++ test_scripts/nccl/static_nccl.cpp | 42 +++++++++++++++ test_scripts/nccl/static_nccl.out | Bin 0 -> 43920 bytes test_scripts/nccl/static_nccl.sh | 28 ++++++++++ test_scripts/nccl/static_time.cpp | 37 +++++++++++++ test_scripts/nccl/static_time.out | Bin 0 -> 43920 bytes test_scripts/ofccl/clear_static_ofccl.cpp | 42 +++++++++++++++ test_scripts/ofccl/clear_static_ofccl.out | Bin 0 -> 43928 bytes test_scripts/ofccl/clear_static_ofccl.sh | 28 ++++++++++ .../ofccl/clear_static_ofccl_time.cpp | 37 +++++++++++++ .../ofccl/clear_static_ofccl_time.out | Bin 0 -> 43936 
bytes test_scripts/ofccl/run.sh | 46 ++++++++++++++++ test_scripts/ofccl/static.sh | 21 ++++++++ test_scripts/ofccl/static_time.cpp | 32 +++++++++++ test_scripts/ofccl/static_time.sh | 21 ++++++++ test_scripts/ofccl/statics_ofccl.cpp | 36 +++++++++++++ test_scripts/ofccl/statics_totalCtx.cpp | 51 ++++++++++++++++++ 20 files changed, 481 insertions(+), 20 deletions(-) create mode 100755 test_scripts/nccl/run.sh create mode 100644 test_scripts/nccl/static_nccl.cpp create mode 100755 test_scripts/nccl/static_nccl.out create mode 100755 test_scripts/nccl/static_nccl.sh create mode 100644 test_scripts/nccl/static_time.cpp create mode 100755 test_scripts/nccl/static_time.out create mode 100644 test_scripts/ofccl/clear_static_ofccl.cpp create mode 100755 test_scripts/ofccl/clear_static_ofccl.out create mode 100755 test_scripts/ofccl/clear_static_ofccl.sh create mode 100644 test_scripts/ofccl/clear_static_ofccl_time.cpp create mode 100755 test_scripts/ofccl/clear_static_ofccl_time.out create mode 100755 test_scripts/ofccl/run.sh create mode 100755 test_scripts/ofccl/static.sh create mode 100644 test_scripts/ofccl/static_time.cpp create mode 100755 test_scripts/ofccl/static_time.sh create mode 100644 test_scripts/ofccl/statics_ofccl.cpp create mode 100644 test_scripts/ofccl/statics_totalCtx.cpp diff --git a/.gitignore b/.gitignore index b0853be..5999837 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ .clangd -.vscode \ No newline at end of file +.vscode + +test_result*/ \ No newline at end of file diff --git a/nccl_test.sh b/nccl_test.sh index 2243904..b5ca1d9 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -9,21 +9,21 @@ export NCCL_ALGO=Ring # export NCCL_NTHREADS=64 if [ -z $BINARY ];then - # BINARY="DEBUG" + BINARY="DEBUG" # BINARY="MS" - BINARY="PERF" + # BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then + target="./build/all_reduce_perf" export MY_NUM_DEV=8 - # target="./build/ofccl_all_reduce_perf" - # # export CUDA_VISIBLE_DEVICES=0,1,4,5 - # export SHOW_ALL_PREPARED_COLL=0 - # export NITER=40 - # export NBYTES=128M - # export WARMITER=0 - # export MITER=2 - # export CHECK=0 + # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export SHOW_ALL_PREPARED_COLL=0 + export NITER=16 + export NBYTES=8K + export WARMITER=2 + export MITER=1 + export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/all_reduce_perf" export MY_NUM_DEV=8 diff --git a/ofccl_test.sh b/ofccl_test.sh index 536e56a..3465366 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -36,29 +36,29 @@ fi if [ -z $BINARY ];then BINARY="DEBUG" - BINARY="MS" - BINARY="PERF" + # BINARY="MS" + # BINARY="PERF" fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=40 - export NBYTES=128M - export WARMITER=0 - export MITER=2 + export NITER=16 + export NBYTES=8K + export WARMITER=2 + export MITER=1 export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=4 + export NITER=1 export NBYTES=8K export WARMITER=2 - export MITER=4 + export MITER=16 export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" diff --git a/test_scripts/nccl/run.sh b/test_scripts/nccl/run.sh new file mode 100755 index 0000000..0e63f35 --- /dev/null +++ b/test_scripts/nccl/run.sh @@ -0,0 +1,38 @@ +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib 
+export NCCL_PROTO=Simple +export NCCL_ALGO=Ring + +export DATE=221221 + +for MY_NUM_DEV in 2 4 8 +do + unset CUDA_VISIBLE_DEVICES + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi + export RES_DIR=test_result_${DATE}_${MY_NUM_DEV}cards + if [ ! -d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 16 + do + for w in 2 + do + for m in 1 + do + for iter in 1 2 3 + do + export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M #16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/nccl/static_nccl.cpp b/test_scripts/nccl/static_nccl.cpp new file mode 100644 index 0000000..3c8b2b9 --- /dev/null +++ b/test_scripts/nccl/static_nccl.cpp @@ -0,0 +1,42 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + cout << argv[1]<<" nccl : "< a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (7+ranks);i++) + getline(cin,line); + + for(int i =0;i < 6;i++) + cin >> str; + + a.push_back(str); + cin >> str; + b.push_back(str); + + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + cout<<"nccl test algbw:"<v&HYXGg@Lhg6%z4ke0 zPG(Y!!~MKJa@xr{d#$zCUVH7e*Zy&4&c5ASzdFa|($tl!eORNEdx6AMGhxq{_y(X_ ztJ2QKU$=IFHVx@Kfs^!V2|$(CNKPs&V0s+r#bcNa>$g7$o5o8gU;KHQZGeO&JoLMhN#Pcj$f zOLD#>=ToqV?Wv%$C+SF^DwgYnZqVp*i@%!Y%(sE_IbkjHD@bL~)%>#2KAG|wIbU9l zl;ecExqb?&@~%Zbvdg=@*qmAumzSv>O1a(&y10N3b%olNRD7tbeQ{SP9PeG+TeW2I zl8Ta8q@+x6kX}>UAsw{=y99zM4VL`!eWra|ZlxGRXf$2K}$j!2iJv_-8WU@5@kM@;f&^&Z{Yd z{@=)er@rHM3PJj+4EUdAu;*Qne~va+tL~A)R9yQ!@E4_#|0LuP|K3|9u^4H(Ze=-* z4mm@x5Aknjd=c|&hz~KNJ#hKu6cd(cy57+p3F|S#A2oDc)9YFq^!8vh*b#~u!Dvgv zs;)>l*y3;N3QAgf$_>p%S*flYozcivy(<{*Fgo>MG#ZIEt_^59s9mwTK(pcT1Umgu z-H7@_1`yt6KmK@|KqznX$3g);5;LMfe|KGr-cqW2o0sZMYu7a!OLaZa+w1Xo$|M(< zLg9|O<_f*JQg89-UN6eX%$Kp?RUYpapZayMvraaGpLX96yFq##k+$PUjiz$ zAgTmz)&re4>znhn0I9*m58CanDyj~ zzj^qZi}^|}<}!qp|8lhH0-rDVXy#PFbDccE9AN1qFBfJ!wMeLOqVbY#XA%F8nZX54 z3dKZk#AnON`!Qlo(}p)((qMKd zbhZY4PcbfjUg!*(nN8#Qx|+|c@kU!nxioEr=j#gp#v+M7>c9^!koW=~R}{ZWABjjh z&H3E&R{R~d;aL@dAF<&-VB zJSH7!&Y&(ez7pO(KCAJS@N629MUB&hzffT0pCZ<0I+9ejx{7UhID~n*ZFmgn=C#O% zXIBukQXAgB?x?ciscdys+wiI_iRQE6;XvlqXv3drVl{1p4G)JmuT3`m3=^wqoi;oi z*1URbcwV9kTCWX1)8y8)?KV91DRu3#;m=V>z&$p+eSZCb4gWqH|6UuO)tXT#4? zNWlFz{JA##0UQ4PHvFIsf1V9LbvtE&DGN+lV9Ekh7MQZYlm(_NFlB)$3rtyH$^ugs zn6iMTy)pCrzxn!JEAaK_|8hOHD|QYVIb*}VzQYBFgn7m)|8t$DjgDRXJiyEg1b-({ zMvwhwY;0@~<%+c8!e;9ZZXre!7_TxZ>DKuW%Ss!W|}rsMvv8)Y1&8`J?1gfw1F~u>_Rh58z-a3 z&N0)pVKREmWu|9}^q<~Q^`#9G)Za|g#t73^}OJ+Id#p82RR@mpWtOD7sz z>Id?tVXM!bTf90nLT>6eLal&`-4_EsRp3L>Re&T!P9b+~Gcv(Qf*wM|R-Bo_-f7@5Q z4)@UFm$&-*R*bI2tuOJiF~irt;u2&_KKl09Sh5|?{FD4Az~{ONdCl@E|FNwg(oHM! 
[... GIT binary patch data omitted ...]
z@%C2bE0~MQ({H;tOqlN*ILi4IpAd#WDy`s5?{b$x`K4_i(;V5^i_2f3-xL ze;081r6xw+6mOCcKc`B$NLRY-&{2e-`!)%nZQ>vB(eL$ie9U@FvUsnV(w$KP?2F6P g57ZC4n5fECy!iRLS$Bma82F-0(U~>&%aPRn8}@h(vH$=8 literal 0 HcmV?d00001 diff --git a/test_scripts/nccl/static_nccl.sh b/test_scripts/nccl/static_nccl.sh new file mode 100755 index 0000000..761ff36 --- /dev/null +++ b/test_scripts/nccl/static_nccl.sh @@ -0,0 +1,28 @@ +g++ static_nccl.cpp -o static_nccl.out +g++ static_time.cpp -o static_time.out + +export DATE=221221 + +for cards in 2 4 8 +do + export RES_DIR="test_result_${DATE}_"$cards"cards" + export OUTPUT_BW_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards.txt" + export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards_time.txt" + echo $(date +%F%n%T)>>$OUTPUT_BW_PATH + echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH + for n in 16 + do + for w in 2 + do + for m in 1 + do + for iter in 1 2 3 + do + export INPUT_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ./static_nccl.out $INPUT_PATH $OUTPUT_BW_PATH $cards + ./static_time.out $INPUT_PATH $OUTPUT_TIME_PATH $cards + done + done + done + done +done \ No newline at end of file diff --git a/test_scripts/nccl/static_time.cpp b/test_scripts/nccl/static_time.cpp new file mode 100644 index 0000000..444446b --- /dev/null +++ b/test_scripts/nccl/static_time.cpp @@ -0,0 +1,37 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + cout << argv[1]<<" nccl : "< a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (7+ranks);i++) + getline(cin,line); + + for(int i =0;i < 5;i++) + cin >> str; + + a.push_back(str); + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + cout<<"nccl test time:"<SKDjvwLY{|gP`HQ*V@?URqBtHw*HuiwlUQfTQv8(_Fns( zGbb~_rm_FL9GI-L*IIk+wbx#I?H_06?AyIH%Q9RpOQ0nd_XO+lq0LCTlzI+8Cb3Tm9D zeAz^D(;r-J@z;b~&y;Be?fFD`70mB|x8+K_g7$o5o53}bKG>cNeO&Khp%iGWCz*@# zB{*M#^C_6m_Eb>WlXRp{Im>lIH)wRZ#a~Tx=3CAAoUn@d6{IrgYIxmfnMiqcobSB~ zAx3k;ZCpPERe7I9KC;Wlz1W;uJ(riR9g4W#3X-4Dbwzuqxvb=h_Lj@rL*cIO%e%|V zE-x!7j718I1)cPwGN}2MuV~Vsc0b`*oM?+#oF@AF@1B!)<2Q1Dbl~Mxn;L!heQ@(j z4|^WHT2+>!?Wn>6xYOr!rbY4|TrgMTg!{@yh8CBJjyPpRcJ04Yr12Mljk~ zyR!1vb=L(n9n|(%b)doUcmi$y zsBT34Ap;0+gCBpq^&l*0_Qyg2JrXmbL4QYequyAgdm9$&_1CRvFc#{1pu5}S@f1rg zFonXc)eR+jL#f{A(Y;=jk)AJY!Am{f^+LOnV7R3{m4f5wRt95Du+Wvz+>yE5UNbhr ziS=F(2!*Af@=zqELq@QqIvg^pWW`oAMA~ECx@GkZkp`V?1Vh7?vXz?sExNzGJraNk zU|zr8j5ZJJ0lyJw18YlrowUuu)}Vo*B$&!PW7-c+<7mJ1&63o7dTV#L-WiO>B4K}f z$k?c_FLGEYx$VcZP5pJ!zQXIqI$f$Z4K%{rN_FY2;SC+Bj4HiFm?|0+Hk_D-X5(0X z{WrBf=Wj4d0+B9*8xIa_HeG`e{dm1?qOFAx(i#ZpF^V7_O>YWs2%-4;V63YnNbx10 zLJOiw;1)g5c8k8wA8OYScgSCb3H?T}J7f@wDzPpajC2MSZC!|*;0uU2I!WW;)Zs_w zfB0cHS8A9=YjBR2dT8$Myft2Gvphi&+?ZT!b<_;YOdeAb`ZqQHh% z<1y(-a|U&(@s;rQ@mY1*R-8 zWr6>{E%3*@bN=G%c{|6~oBhj`*sj>xZ)A)P`g#uL91w;XEq!T)rj3kV_%cA=xf=1e z5oKiPH>0DY+eMl-U`B?XGt;#3GBUK^Ow)$T$j}eXG;Oqu41LE;)8@{|(A{R5He5!A z?l9A|(K0f0tC^+^mXRU9nWl}Ek)h9;Y1&X38LBkXw2?A04R`;n?IZIGb;W|}reP=7N`8zQK`nWl{p)Za|g1_yS}kv`jzp6y7V>PTlh(jV3& z>-)YV{f;C3x+DFnBmG~F^v@jWryc2^IMP3Iq`&V-|Fb>qd8a!5+|9oDZ+$(l4c9f+ z^ktV}>&e%5>XT?FU*F>WXs*%O_aMuA+bM|~Utjj)1Qs1LW}|C`=^kC{q|qaJ=WYhO zc7P%H?-u-}cM-VcBVT;jxBvIo`1T*m^tqn#J@=6@3j(|>AZPT*I??VbKlQ)Oi{F8J zXkC{z`Fa+QT!&j<{B>i7uXpi9$d-8Q!_m=13!M3x?5Dx!x(RvB@+tq&1`z3{3Hcg) zcf9RG2af+_AM*L)KYAFT#n+p4;XEPOoAC7edS}lfg?hIV(bvexy4#Rn40 z;kn>B0#z)Ye95zzdFBe9BmwbfV2z{5lz4%}_LY8$c?X0hPr(jnlSuHzUrhXhkfsI& zke>IbmiI(|PogNk^%WsttD6deLT||AT25XplSc@E`U}J-U*A@ZsRv}Tf|LI$nPrkR 
z43SZd_;0&<+ZtS9>$Tio3Z#k6o#mWBvtJ<`Gy|BPxE~R%X-T4C0 zh*cR#6haT*q62yN^l#}m&{UFR(Bu%x(zjr!Mf(=N3)zW%P)Qb5Y!&sU(37&ej@}F1 zW=Y-pDZ_(i2A7rLQ9|}+|G}qZe~nmoo2)M|0dCYwNYyL@iQ|6}NsSx-KJ)}r-+{yq znOw`%KS*ZTKm&<>B=?d#_456d7~b>an~ul3j`eIh?&>POW79G6+OA7r z^GB(96GE5SA1tT3mfi;xZ1*yyAb1Xl+yB!P_5@{pk_P<5E;x#42D7P(UK}eI%S1q0qNbHWW%ruRj6(Ak?;1 zNQjntqX<$5T8&l5ZcMv2dteHs**@Hlr`faP(R=GmQ(W?tFFuF?AX{_{qx6+}Zo>i*wm z`&@$#e?w6=jH;bbU?6cPLQu-m2jEKFyaN?9>6A@9P$8a>IeZ#PyHC1J$BH*4d_9|n zHRIFLYDo6hDvQlYQ+ATJ$+Aq-iTF7~3b6U7gWH=RugMc(7M_5TExU7Or``x&)_8x6 z#(N0qLsRsjbHqilK9VQ;O6@T#)MY2u{AqO5iWby&9kLQMxkW=!u30ibW+yI6%C#3h zqS>t305NSa=N`nD-M)C!PP<4k9r5n*_02vB)9NQEE7kY^&>$Z7lInZNGveMu@)nL6 z_YR7Cu>F%Ti}zt)UlTrFNEfv`B+-XumZpA2zc0~8fIb2#djWS+VgF9yI!@i}uy6kn zbX)j0I9vpvwCJ`s(wH&e72`wV6(lew-YV$>iH~UPr160?7)YFr1(C0}2v(w^4*BB0 zB3e-W-o0*>^I&q$-w}`K2vDwTkaHTG^Jqk6>r2k|kjUmNa8IfN85vyEd~-Zg#hwIY6VnIm?xL8qOF?HUiJCD%bE`P_WvatY>MUEGz*a| zuzo(@?)Ak*jOdq^r$&I~381vP{O!1*u~<%D^R|2q{xQ^xKXNo3Nc`UsNbT9QSL>P! zR&Kv!DZ(ZG2!|43OjM#)&4_Br380RsM75=1KNMH!H)w2*{ZY=b(aec2UHxV z;V^m~90|=KT5BLd188;pXzAVH;kG1)1L`=Y(HIPb>FHI+BnO10FrW(M(mdt4=*d|*2So%*5E-m#GYq=CfqqC*}~ z8vpYLuBj&jr=(aRp$~)o&0)~Fb{mNoU6GTY7fJXBGhyZE7iV{D%S}-lxDuj4;jg` zg+kaQN#7J2j0tq#77EzsEVf!oTbI3)Y-=;ZjfswIz-&hb>9B{LPP!MNkFHGg`h@fu zEWN05aTMH#UeHJVO7w&o6x3-bPOkD*D9OHhNu^q{6V&Yl22A4t&57V#QYI6_J4ueOfyFCvMKfr(Ole+@Ri&?z1uWY0l2 zqIT+-&0&B7w$02%b`_rSj=E$oC%+<72}j%y(U$6FSL2Ffv;x&?Z?c%#5Vo?$SdwBv zIf_S(c?65s_M_NCX6s$$9W<^6M%d(8p4o6{d{QK73fC{gf-nB_#Go*O_plqCGT}b@ zxTr+W!E#%q7+b0;u@7;^8bXdh6dH1oL=7a|!i{L)fmsF;6{7fG(d`_fh>Qb?7o-9x z`(ffddKHl_W!*!e{Y-nM#&7LJim}7^U4VljWbn>o&*`#7UiEbjARw#J@axg=*H9xl_^&N8w znNK;!wl#Pxq;r51p8XwCQ-%Czh@s*fnw|+=d^=pkySzz*@5*JzS{t- zE&4l^5IO&csf5s}os!rW+tKeV*>qvk+ezL$dl!Nsgts0-7?Kr(Os-fb!YNnm0;(m`+5$ze2ZS~dJFZ+x$!2N-c{aQsUKLkt}yT1 zEmWK+kD6S#Bt3UBpEL>b)SoE}Oj%&c0#g?F2U);{-?XSp-C?>Ru-L@GFK(>g;8fs# zGd|)67sN~qxK7CLWU}R<$sLKIuK>_v|LOQpnJ3{xkMU2zX9hlE+KoR^&9Aw`fk3|JUtYS1)~OJx#nbEbtm+%!~OhU zkB;uc{X<7aM_&Wn2Y4KC{%=P|?+2XyyV242pyZoxjE;6fk&-ttVF0WJoCSOUumo^D zU?boIfCk_*fI9%+0DJ^68{_d=sAvh`)qraO*8pw?ybbUdfV%1=a*8raa z%z|;=0K5S37+@7(0j6~=fY$D(pz-!1?kphb{vAJYXH*y?~v7j{|N8{5jymfS<-QwL!o}z+u3<0rTM} z_X8FI4g=N!&cGJYBY?L79tTvvg>ikMUUPNlyXMT8p0gbtitu>)t!;*XUID08O&1Rx zF2HRGK5eg{{?zeu^OxloT$^{whMdjXHD_LV>4FQ+2Ug@?1K9s6^dJOZB-g-aFYuj| zpD!wF8$O=bMn`FfUi0SWe<@?>sncqq5RycH2%n2V7r*OK^lL5pPeDHubfK@Jue0cH zgMKOK;`d;RzRIG{fIj2V=bwNM@tW%kbv440)4y4InWoKA_9VN}=M31Bp7&?v!AX=3 zzMjm?{h2}$s^<=r^BeU0boF=y^kL8;-;&X6)#G{4kAt47o^E6zI}-h~fLY&xb(J0& zJ|Tp7XrW2A@Da})HlEw9a;`>vS^=Kh!DGo-Z`NZC=sy7cVmtk2EB|evKMFe8#ms-B zMc)PbmnM``MMc)VdL!hUsC)x28reg_V?T6w}Zago}cPqi-m{5a}Ye~Y<>*%XF*TbHw$3q z{{sDLdl@mS4&|VafPRskev?IC4LZ#g()oTj=+iJaNawpdK|c%hbZxK~bRX!Y_VU}U z@(+W4Gw5^e^cyVtG0^>>r?N4%VFBh24WOrsE#;usgC4TW&sZx8p!_sP*)@)Qn!9`j z^kw$^o1}hj;C6!k2U1bR9f(fsHX z%)ip*r+L%KpcmM6SZT?p`BVnzsch&*o;9Gqhw|6k^OK*pNf{RaPj*}e8IR#Us{w&X z)r0sRw9488SucX`M!T$xdci=pI0X7k>_E@A(`|7f0s1+hQ<-KNwy`D$!}0l`&$iQR ztn%l9{%O#Cb~<$Y0$8YgAL!SBo-UTRfxgB`NB#)szZ3M0pr^7W$=?HdJLpOM0r|hk z`454<6ZCZYCqR$9G&+j!#ai{xs1g-)1C@h0=PuC2Z}(Lj$dE)iG>Uce!BdAaf`nL> zoB!2}>fC~_WO{SmeOcbzdE2t9a*Mt+&6iu=Gktk(ML4&-GPkHQcixg*_mbR#CAs-a za&yG{5)8=ch8%<-^TJo_PQUK+cy{N?{_b?`2N}<1zB)~NVmeqr5g*)r^Fr60t8UP= zpSng|uV%T9X3oP8wX@HAE?fIuHaIcu5f^ggEWSoFUQx=%Az?221EHGt(DGN+lV9Ekh7MQZYlm(_N@V8n(Y%t)R$XJ@vrQWlr z)}~HKd#7|MJiP;ut{D<&np@tS%IWSa<&9XU<5s-~a1PVevlje9OxJcI+pC>n#qiK#3U}UH-tN}@4z?$oJ~6xmPqXE zh-)6-&~qTVxHwI{OKop%GQONEuJjhqoWO$Z3~{OQ#U2(giVt6T7xjZ(1qxqL7U`XA zSUk!VX{9F~AByy~OvmFfk>=*$(k2h6_o+K27P%#+f?*xQwG2BMZf3Zh;ckWxGwf$L 
z$nXfmVTS5uhWY1d;&L-AVpzejj^SE{oeVcK+|E!+nfPDz-bp0^F9^NTJ+G;`D{OSR z7vb%oMVEWJM9TBU;-bQ$l0wfV60dTkhneVI1*)NKZ1i@}@|5^YO}&TS#txs((r!!1 zpKbP|Wd3QI>OaZ&>0;m0j-&fbxT^9$J1rC5uKYF`f3l|fM>75tP4%N>Je=EY$H@Ck zoTlJTiOm@7d?Y=L7%-*iJ|i7qUR2-6n5wU zp5&3U(fhij92n+Wqg4H ze>daLa^U}i@oop6p3#$@7dY@AFn*o`Ux0Cw_~$$DS2DiHfxn*dB@TQS@oOCTzc7BS z1AieJlKd=2AwIq>TlzuSTT4&(24 z;0GALhw*AY_YUwBmtMYD6PI|7g}bvcfh4?|Z>xDRx>3rwEf-a>tIR^+NvGapXoJ=kzh-PCMh(JYMN@yO1+>-1!>t=cB&gzCv;qbA}Ir zPjwE(X)qx14~mDtn%2tv2IFsCAQ4wH{srJ?^1U6b#DtsVG+rXH;<-3(zsmT>=OyD` zWBd&9B_(|4f`!$bEakj?X)^yd;PbUJwF1X@`cKS1kIQ9o+MgJI7uRbU%ij&f+>qlO zm#@cyh4kN2BpJ_R{+|MW8r8T$BG5g>^)mB+!%?r#pvhC&rv-SD|0q{f@%%Ck|DOdv z7iI&W!g@#LwV^fo?juGu?W7ANmcBzqmp={vzW`7ATr0kWqG^vwc5Okf#OJd8Z)bcT z@TtnBdE{vsXKK#*`O_@t4<%B7@`qFNxLo$Di&=h@@lN}{34E$@OE3i?{maXwfOA;R zmw->jKbVI9{L`eIor|P^8kVyI`14RN_5OIcy|^9*K9zhzl0FB@C7+M^S26w>_D_XB zgo@A17(4I1pZPz}e#PBbdmMQDA9J0IFFR7X53t`3Ge;Zao&9kS<8zt6i}}yOf}7<0 zfZOFF#zz_dDz~H3VL#)yFrK@aRykY78TB50Ww%D)N&c$&QZP50_Cv-y z2EPM%lJATs#~FW9u`Ks}Pp zxW6c#`{D4^AKh~$w`#}lG5;y-Z%Y1IXQY<{!EL%M#=cxOCc zf(%=m3wFJ;e#@CTCf1LlX@i|bCtf8glv-vXY-JLfpnia4K-^;&K=ADe6#{N^s zg&b!5^Q`B3#xK55%6X03Mfug;jPGTBH9n3o{U8TNDKx32>4W*$5E zfcgKG|&vVhn2AH{aXA7n%QNE|=X)dy(;X z0@9^)z6J~=-{}wc0p>I?+~!408UP!B}HILxdofD^U^X@U4L zmLD2IPOC(gg<8XrD9))Oj&fZOMW`GkS1iK0df_fQwyvhw1MP$B{9Wyaj-PYgqIX8x zLxGL*JhmE75op1vpm0eZR7E!;c2tqSCDJLglXM(vR*Hl8g56R}f7poCl!gL0{*9!j zB$tIFv99J=T6#I;hdOZRR$7jQfk=Bhj_pdz;3@Jht*vYbL_?iMTBcHQ&|O+`d9d5S zS!ThOv>Xfk8*WKUENP7fJJV82gX5At#X7ak#&Dz~z;9NF-($_)K?LYB5J6pa~rB-}m@?*ehm zB?O+JZ+Mu`*w{Hv4=R>i!C1gQ9vc_hjMMd^!FB2F$AvaW!FWQhPJc8QJ|P#$8m9s! zvI00<@&t^mTGStI9Y-8<#X83=wA3sVy+7zrAACGz+zMiJ!TE#ZG^{F`jxc1aAi}o< z4R|aD3cUei%6c4A*h0q>8bLiILRJZaJr6B36d;a-zJP%SC#?E$L}Ckm@=HSsrHeu# zOAw{H(1~C|t;r!Kg6DbshKwT8$_2R79M!f+|Qm)tI zI9MJ6i}flpcVr_?D5`6ki@aWp%AyG(8z)w#*SlU6Br5TbI2e^prw+QrL0FoO=^&}LD{#QKy` zCf`3$&AqD(y~~Rl;Yi(u=vUP%5!`TwtKNly_7k?FdCYC9Ba4&z1u1f!&D@`?HA?7| zW$H=JLz62iY^TslsyfKl8388{?_?vC=t(uFLv1OZPVD^ZBvG<8=@9R(usL!Cai}n6 zN-}tQ7ATrTB91KRh^)uR_jmc67lB)a@TGdg5)R+W5hq|OI#yha3yV6tVr__V7{KAc zr6hTrbIKj&&|6{=y$z!_oqUdb)j8D6?HjVF`hTu=~3kq)4}94@;U5PCc2KNNMEXUQzq1Vt`*(cRs*rg zF&QC(Cq)g$f{swQx&z*f30^V%clgZTWQla1b6tZu!2UgP4HJ%9oT?tgUQ#=jY?u%P zgtil$8}UeMpHO@PUWhp1^g|lAC&wb?9sX`TM!Q!uHA3*fGR)u75)H;;k#!StIWM{k zEo6kaVysuj4vSd1Ojek}?24=-7c!N!mZ`@ehP8~B)=;4^cPn!V1@D;*@As5q8H>4` zxfuH-RtRF|EhbA??P1OBtV@H@9;;3pgzL$~V$V>IU^N%?cT{5;Tcmqovij@N_o9Xp z9YKRE=yk*-xrmy~MNq2sin-XDly>r4H&FcE}&7IbNyxx^$p0a3A%vCY& z1+mQ)2*QiJC1NvfB@8LTC%7hWN5oNIF@TM&t?+%Q=Lxj=qdM=x1iaWn8RPWnRf0ueW9?G4RVN<2Oy1TC#;{toYY_Ab3r=pkC%!!itWH@kK7z8Vq_Cu)!Z~sUB~z887-^8j(5xA?$Dj;^<>m{qGLp zSgO}zr_*DyM422Cmlf=HASh3+bZgq1_ zv&Spyf~oNs@0?I&TGXr<4=%eMYaFq;N&$|Z6-;mqpFCch3;4;foLZVpPT-SUvtlY5 z4T{!C-m^phFuVJNERBemuCrryFQ*&Q%#&dp52m>Ey@)gu!}0w~Z;r)m?LNV!qqHdb zO?fC16OR*0s`1=w%vJzK%JxumKo7*a#8WF+0n?T`9>6EX8Z<{Xmw{rni9#@RlsMOJ zW5$qA;t3S2k1-3Tz;`=DJfBdM!y`LHI8l$kGhTAA{OGdNOUrT`2yHZ$b;H&E!FvSzwVjXLq$xF z)zV9@|C1hMPTDA7KDQ9l)Ja_FSbHz3)7hqw9ia&t#dI?^kJvl%u4vFxJ?xS@0-n;A z&^lh;>0)UIg}i!$PfSxMd5=|&Ox2Av&%#%+FkuQtu@<23v^Dyh#WRDXhf@I*U8*lAESkL0MJ}(bs7+tClhP-#x2o}dk{f5Cddd&ykvrca zga&V&m@|5t;Yed9W3(QZPpc+bdo`${%*lwgCFZ^;g7;)D@hvOUmoU^$jSwT3Gs}n74|r%~*?a^E)PJ6*D+Tnm>;(JYd5XO8l*lfoRu71!#H%QHLm9 zFTQ4?w|3x*JW>fcNPEi1ZWB-DCb4~o7>1_JvIdk)VJJ-yYB0jVPsZ!vO#hH!dy;#X zwY`}8#4e0R#Ft(QgKhkU=(ZL}Qz;Y^2n-NrKJpY zkHY;OA&8d(v4GGD@jdMheD^x(8{ZR3(0gcH_+)Ty_1+By)q7}^aN@OJ^nL&7_^ABl znUboYTPDrFIp&?bp|8=~^RMRo3aan#t87{7hGDfMzk1(?f^G-DGynCVo9m#eozxsVf2smO<`PKV;6jbl;0gJet_P-Ta zdN+>Buii(ZppWxSW2pEP{37u5t{s(D?=4YqH|Hb$Nl&FeqqRQVkWE#7^?nltH?zX3 
ze3f6-|LdH82@6#3K~Ye>e~9v*fsZr)cYvcJlzh#N1YK_N7r%H)zMSp93uJqKw_EZn zsPtD-@XIH0Dfm64?D^IER}@ryoDIL=ke7lFJMyddv?yplbAxiKdIK81S{H+KEidtVg9&ytfar~RJh{Hp!63T9_G!bt^G`>FKLa6{#*{O?srih|n| zL88s~hHz4Q88`O)>U}c`-g&a(l+aoJ>yG^Dy)+6sL#(s@e@1@%v|98HDnY#$U~mnk ztoD*A%YT&~nNXg7+r?qReBZzk&ae2=<g$%4hpg|D{XGQ-1IuaAZG~ zU%mh9*v-TZej?f~m0!UE(Czun-$$;MnN&YlB2+$=J_nfzr^>JBb6IHp2_cYw3${+Zy +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + cout << argv[1]<<" ofccl : "< a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (11+ranks);i++) + getline(cin,line); + + for(int i =0;i < 6;i++) + cin >> str; + + a.push_back(str); + cin >> str; + b.push_back(str); + + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + cout<<"ofccl test algbw:"<=Toqd?Wv%$C+SF^N|x({ZqVp*i@%29%(ss7IbjX+D@bL~)$~%lZ6f70aK5~1 zDaQ$K;hx?Fy_Jg= zEGjRFMM}y92kAv+Q1h)?(_%pFLBg>((H65fP4rJbGHaXn(~Iudd(A*i@y)%Po|^Tg zl}ZMcNj!8>el~%XJ|B0)BkTYDtXo{O(DMxALVV`oQ#|*!^M->@cE#?Rp4HysFPyz+ z#tNX*UJD?6W*W+0Q9wHUtPK3$&VWBFgB+?|I{9@O`0HVabo_}7{GZ9d|G!}5baG-D z^tm`gy^dwTUyz~Pmow;jR|b7HXTbk1gZy7*(ElSD_%F$Te<}n1t_<}hzjNc`yc#p; z|J@9D>N{?y5Tuu7!2cqHJ#UBnvy3@LRkswT;@Z8yUzkSzP&;F_fu^{}6X@_q z&3M!wiUZ+o^5c)U5rl=U{#YnrMq=@3(BDTLfcGu+X*8+>yCbuNfQR z)Os%rgu+r#WhfFeAtP8`8xF;5WX0ArMLJ{NhLw#?ktUOD1Vh7?vXxr>ZKl7oGZKIa zU|zr3iZ&0M0e?Ku0oJz825Fnc?ZG&Pl3+UXjA=hOjidcCH%m(MneDy3W_K_ei-i51 zq4+j)OR2*`sck==Z5pqZ_7z??*6GrM0u z>%Xb>Ie$~UJP_%LbK}8*t)^>=N8ej-n`mnxgtP|&W{e_;$1q#MTSF+mF&OLV3Q~Lt zsL+C_64-18IyRe|{Gm<*afkd>n9v^&_J-nwqDpLv1|!`;McWi2C%6$2$0TVSoVxtT zylTY?b74uTv8t|i`HFSxa9>_hw#dM3_3~QNQ?gK}==UXM9+@=DO3JOfvXTnK&}__l za>d^q{LRLEB^z@Y!peVH#&m(t6?`;v%Hz3C4qz6r^pTefGoBhG)Hu;_@wPLF{}E)kc)qUY^J=^?=2I@iIKuOFg@3(R;*UD;gYzUlkH;0ouhNGj zl1_6zx4ad9hc!H_BJf8v{5cx`Q4P;a96?QLc(ukN!U+vOSCey6!+%J_7qb4;7DXCf zjmM-T%^B3C##h4YH1GTUW7$ zXIBukQVp-KJ1R9im94HS4X@ggXg&=O2ePgP4S%|YHH>u{9u99^8#Vk43u_o18XgX7 zUELasAeamO=vV*6{lLdXI+xfX2U9!_&G*UHdfrY=s1T zK*OJ{;rDC!4{G>94S%kNpSqp0z?221EHGt(DGN+lV9Ekh7MQZYlm(_NFlB)$3rtzS zFkUbC&|AK~SMq%Qx&OHq+ZDS8<5{D_zP>|w2Zeb?D}KGkFh)i%dKRGIe8JyAl#ye< z8yy|pP5C=$17_sdQ&yTbUPg``u+p^QGIH$uR+=_iMvi^OO4H`f$g$5^Y1(iZId+Sc zrj3@7V>ei7+F%(u=C{(cu`+V(qgI+WR7Q?fTWQ)z89C;$(zJmxa_j;tO&cd8$Ii0S zv|%!G%w?quMEbRNRefoL1ogMlv@wGETWQ)5LH(^XZG@oyR+=_IP=6~;8y~2@m8K03 z)L*4x{kg*WHL|^sZmFl%u+#O9wAYco%8{;cq!&2S7dg^%9qHMQ^qG!yt|R?UU8=rs zI?}H?(l0sEzj360=}7;~k$%jPe%O)zS4aB0j`Y{{wCB~@#8cP%5-<4rUOdszTsM$A z4O^YQfirK%4wP@;ssm`Q(b;z(%Nx5Xi5uTQ?vDs8IT@dgt`(+xbgk1yj~1N21MJ2@ zhT#94;IH^BflJ@@B~JJb{OKdU11Gb6uAlgxdN)1`0=z6BZ}jLU(e5fg^}iigy$bg* zdM`T868cw!I^)O`xy9K8<5v3pYk8u3L@RKAYYU3 zmREe}z=?++LOx&OUmpNy^Y!OkR4fF0lb%6e|Lj?$Q2#C>`uYYDK2YTf6FbSi#87e- zJQqAip^D8@D0!}8o_T^NML^;wu*Pv@NwJg`ev<{7FFeJKSc+4=sR^8ErO z+3bc8D%hVweTQ}pTP4$7wnNt*lCa~iR5FoKa-XQYOGHUDRTP`9&~U+O@D))>32Lrz zt<_xfgu$NXQu=q%oiBh2u;xR_$I0$X4i?-oxN|U$mX#dC7Kc#w31SkC@+f2{2cVKH zD%&n z1t-i@3KKWIo#;8)chlRhp0ZnRI!PU{=TbE9acZ`?q8hVHR#9Cmj*w6E=N3{5f*VlA z?#C@*Kc=jY(A*_?4+5Z8O5&!IzC_OnU%&UnSnTexC}Q;WBfxo2QjlBnXlxpxj^_|Y zV~6%Wl1+Tj=Ux;IeZ(A!LNC=$r#GI0euyS0c6~%vsWyrrZJ^Y9*Jd9qp)`94_v2~y zXPN0}XDec0=Cr3+3HV@(!WV0bf7z*mar- zhfS%6YVUqkE(UpJl?__k+dhX1q9GvlzzexP*RbY4EK9?v+ObF7ipY|-^dYzsH}6nY zo^~&u^*LM`N!=&imSZJalD@v1P8jhIN~ke*j zlDwusL{E4EO1ACJnVotgcG)wRlQeUIkO4Hs06IrP6zd~-qOa7Sv?E<^@{Tu0N43rc zOFLvGX_}6PqFl3Ne9TR*Psz0(K4NIrY(f|BfI0UeyzKQQTK4E7#l*_H&o?l;84HFV zQ&y_)PqBLPCA_5iKJtu&_prQ$VU^)L~!ZKM8xkEPcPP&%58Pa?%V+%lUia5gh@_^$c^)IOjYb zQP~DkvwcToa~8NSU4c!hIiqO{40F!p>)`G?;X++00>BnS$(cd`c_Os}Dksd7Pgv1b 
zEoiUCLf_>rhkXa$$_1NZ`7%wABnzxx$hZ4_2@xX(rRAv+V0i*4t)Xx?ZfGc$6Y+wb z{|Ns$){j4OFda&|Mj*BCru|0GJg{>6rAiSl@j4tzgfUTxdNoXhJ#RC zq5pshQ1YKq4iDhThp9|CY(I+Ih_M%2;+Oy4W+lz`IDQN-@c+o2$$m=*{M ztsJQ#W1EB13XoPDsBWolre0SJ2}jWD;7Di=(ON@E8bE6kkNNsH0R8+UZWuH9=RXtf zP+j6rb%{SFA3lNU%3RWf_!7?(?l?H=5=ruaCto9zA1c1CdVTc+qMS$6ouP{r15mOxG(kF?z z$&Zm1eTRIy7szHG8;{ZYb}W8~LQ9%hJPiVEhalS(!FFhZ9?}?aDETjFJYRnil5p0c zyKDJYdF%9cywY3%D#zaRp z;4()B>99wfPP!kVkFIR=`lR%kp=1qJE`fpv&~2kNkehfeVgLiQbUBWkCQ*&GHaU~6VB)m8Y3chn_&Ir$ZtN;u*Hh_+R? z8k8%J(F#<%y~$!$Luh4D2E%KL>uoeQAkOYV^Ld;fZ!iE;M+U!YEgt|01?o_iO%lRL}jzVfoEVM+YM$dzg zgBhaM{M?Q-yjn}hLKLM%c_EA3qaW)8PLl=O-;Cgif85#FpKigU*sI7q+~e%PIml!&$GNAUzZrc)UPJ-$Li5p zzxDMUborM2rsrkUF7LVxG|8*JzS=yv$u5$bj2d1zFFi-Io}3Adrv6M>V9Ekh7Wn_x z0xtZfMP2F+gS5b6>jS^Iv44Y8h5H@&h#y=KGcn*gCBKtN%R^Hl5<_1BpvV5x@u4zL z!-pQ@pMlQ|e8glGf1-xpa7Q);0-f%qZbRP0gR!{V-`U=}b*XWgb?L;`o){6g1OxF% zbm>yF-o(%A=tp~IC>#q$}B+u^9Gbv^xM(VLEP^-idhHXK0y3%$tZeZbaWrm zuHTQ2o<#ayj7)JTcO39ez#I(0MZlj6SP8fQkbY0U91oJhXH9HuK_ciBw#OK9t?I5U@_qRGy?*B60m~|iTU%bfQ5kj z09OMZ23!xA1pF*u9*lh$uo$oi6L}xt8o&;~>j7^CY{cy10YE?CFyQ9_PXIm)Scrxi z1}p_!iPiafzz)Fefa|a;vKR0^z(K$y;8DQyG5w;(?FDoLeipC_@CCp&z!_L)>;zm0 zcpqRl;32@f0AB<=1egQIcnfeo;L0~fM{5AL1GWKv8E_}y_W|z%90oiDxDTUBjdWU z(Qx$^y5`K7p0^tvjqrFXZgjv$KTMo8)5RA4eB74f)9@U8hdN|_;mZ7?)dgp4&D&vo z1!?)eYH)00_dNCF7#FTH`(-8Kz|N&@jEd^zs9D|fIj2VuQ&yr)NbV|tk` zKV;{B8}x4EFVN{LZTfkLouvN~o!)KJuLS+Zar9dcI_Y1c=MUQXw{!lhbb8#T-vjy{ zE!?EKGx{ygaE>PI&JA?p7m=x+e#d<_9t+1Lwz{6m&i z&V2am7cq9ElS6vY?`IRBXR>n}=y!vDnXaeb)^jK5Uk5#1I~OC*da?uPJ-E-Ar|DTc zfNs;5n-#ES4MWxo;JrXEYn`OKfja^Ei=dy->8Nkk3$~0|7`q<8SXig?WUUYkRNr#Y zp98%{r$YvLKGB;&H!wyn*6CqeejM}_vbz62{PS(eWi^_NiJl_J3 z_?@)syDfGdp96g#=$UNvHs}wJlm9$~{f9s=(sfvC%fAxzAA+9FhHm6p5BfJi-=gOy zKkbk*<^xZ5JPDo$ai7zKK&0wHeBZXq+6P$)%onfIWo0!A2C~Ir&<}xrg-+MvLK5_0 z(5XzT3~j8*!*KjG=(BZton3x0=#PW$)9KLhMzB!%KG0tSJyR_206iaj_bGJbk8u85 zLBAaIbhae<`#}E?=qddH`9H<^4}-oM^i29ELBH_X(NTO~)~8Gdp@+Kc)~T}`c01Ocy=)Yb?yaE1*R-8Wq~OROj%&c0#g>4vcQxD zGFd=uB;flPSc1}}-m|CHpiW48q;x4fy#tW084}GKO!NOpEn6Zq<7L=SZ|z zN7Jo(2bp^BAMF*;h3yV;z4`8Fgl_g?50x$~f5k=nE_7iVOk78pj-|P{W-=olCy7h! 
z0pU*UFEEY(XERQQB@#P0;wt7Fdfr197iXwJZKZSsJ6pSn|Gsas;I7&b86 z$grE?4u-oK?q&D@!$F3_439EA!BD--u&TQl3whm6nv2mv}Cfc$FhF%tr4jQVp%K(c3{Q z)8ey@b!qWA#`d)MT&o{x;?W1D8LI!J;-`!KO&v$~*>F|me>yE2-md&M6@R*+`bR4M z3`6y!R6Ly9tz+bUHXi!A)8g}uy=mGv+bA#|NQ*DD&TY_3$;KP3RoqI&V^_0E$H@C^ z;{%3zFTYOB#?#|HD|C$FpKZ9;q{E+W#Ydf*jcB9dVk*AKNY^j3jdQFxmdcMP+pS~d zeYW;KaGjcMohD&pUAWCLUK_PB@{Yn#7NU8>yi^K>N5b!L;HfzXuf`jdi=H*PpwIrO z#87#3(enjY8as3WPjb}zWQmuqZTPs1vG&{pd^-KVC-`R?c~!WT3~ur~e`aX-g+(4qfd1%9S6qY5{;RGhvL1-bCX{lY4Nw#qGM{8{g8q9 zXFBj7XZ$S2EBU>QFLL1Nx1=QJJO}@Ts_!h=@IPf-^BR6 z4*WjG-{-(T$@qPYSM#|yfTy_hjbcMw;&~PB=3@d$cs1Wv^I~+bv~h{%w=Rq$=dF=) z^uOui*h(=$9XlSKb0L*Gc0B#KkUw@DX%%wL9y9L57_a8>N}o>)Ib+A2dw{~1N=dBbD3DetUvBB8+$T$L}%zNU3D( zVg7koaFd)~ZkLN0zm4(VE&@opbCk{l zp2}@wIm&KP#y7KI4!g#qjK7BKrR@0+@CTC9%lvSAaox`NgO2|Gb>L~dbBFo0Z=3l!MLUAdcb1+a+y`EsdQvLc~ z;7L!XKYWY%o#Xq^!(48ViRS}P<<4OJ(f!4B3FD8jpM)5{lks0D`&>L%mAG%7sDA(Z4tA*Gt%ns zG~4hirkOr>KoaZXgo^hcxqZDueWk8Z;;QvR->+1At5 zwGCOc6n-uhil?v!I)gZg4hK}l>DW2?T~ZOS(hBKyYg)7BmBqFwQjV z3E-eDL0Tw&e&vUv5Z5k|i$d+;NEF9a5l5wIh9XoPl53XWc)f5BonBX0=0PQboBTbU zaT7n_+H7`5IzxeN@>sSyPbp}@sGx939@NN;M|4!Fzb(=&vy*h3ZB~Kv_=3GsOMf^X ztE&hFa0(nrO-o)Bj>LLeV;Sj{kRR&8*;^So76&4oojAQKBZH^ZyQ03jDG&{H$1^fj zi1Y3;k}HF~aU5wDY|F^8(7$zaMq+t;G}xVyS`i$V>?t#;ZMKCYT_JyGMk$_!U365Q zYSUBlg(GpZHF9cs#20*ze65kK@2TA2IDAWFHT+@WifNt3Q!aT(wQx`8ICWSg8PRFt zIDIzKliATc%-S6dZZZ8^a6DY*nv`#fL?QUprY>d1*f@^kQPRQ@beryRYQ31HZ3;zW zaWfL`9EW$IIPDSwPti9#%opF*Jx&iQlU%`Az&{=v7ut$L_M*W}neE4gwno8tO0I5y zG#EZ57s(o@0_Cy-IAih@jI3JJA8sE<9CO9G$1SwNDipmx=+7K{Jd3y$#2|xX2*+ty zRWzMs$W}pwZwtoZu^1@KCJZfGa9UxT7)yd?NQA6%1bZG^XedA&3Eha11_!PBaZ+L% ze)mg53Z+X!A;Vl%w|04T9kTSanzg1l=+HD*HdfbrP4Ai-I+oSMIm@Qk$4FmIqhZ#3 zd`)$I?F!IR2Di3UggDsMDq-!)l}%n8u3Ejk&dW!TdKTFu8XwGxV?8q;+nMp?O|REW zqZmsI;+V}vW&;k@?BYWntE$AIjW~aiPF)m-ks{JYgBT1koS6RFhKUr4LZB{+@p^G& zY_r*fCaE+VaVjhifn{b5nLDzL<`uPdt)*TsMrF|ik!=$z)9c+L3KEt0pEw_tO{dPg z#C!;&`ve=Qrc4}qOM$-3yhi-6TpS!bp(aN4c6k7--*TJ6fmRraQSfrC*RD zS8L{ef2~nYhb>c2a-N-BRizz9E2-){TW18EK)jQUP;REwoX)nTcsj9TtW!kE)}*t% zd&1Vp6~x)Xm?_EN=~<{~5{Wpnuq(0!Bj4ZUb6x~)6~b4TP0KlaD@UAwspxcZF)l3W z?um6E#$f=516PpbagHl@n8R#~Ma&M2+H?Rq@>QzgWMchyOGC4$ndYaH5f)Mg-Ujqq zae}p0$otzF5LP)f{2eEvrx;$kG>s>d;#^P=Mv=}9Q=~ghKI6QGBGCJ)n@8!Uwbx`g z#S&}pg5vmUd=SOS0c*O~Vy?4`9emb$dsl@1Ck*lQ4) zo=(W))vQTdCr;uHfj%Zxn5)Hnj7BA~SzwJ!=Cy5|4XJ~O=z!%k(X4IE+}s)c$Ln27 z=2;XCin%Jry&$%^0zr6@w_I$-t%V^)_ypJF?T9$)D+aK!wH3Y(^*n(NJe1&Fn1B~s zC}X@n0EHn1u|CD0p?P>~mlL(7DbgA9Hmq!HiZq$29RSZtf|2fEx{xxN^|#ln)WXhL z>12#AnA(YT@T5Aw-H&K3FS`P5=wYZ0bZj;^Ve`+9U*%W?HrKCETXo{W%j9jXU<|8Q zy9Pn8u;3&`{lyTR`Z~c{yF=>JfOLxo3OGpbh zVp~<%_@&KQ%mQepgb4v#(b=Gx6(6V7y1P|unUvw1o)+L4T1Pap)gH#-e2YWjIG#yj z-%hO9@Eo_cwXW6U6?MVXc#L;Ws4{J8R*VOiy^b}G*j%LmhgXVE^@t1;!}0yg zY>maVcAwzVQCgG+zRFM}CLSl0*W$U?7_WfYJ43AjGZ5<$Ppx1DOj{aw0G||V&>Y!X z28z`t3c=7(?p(W#8AINWCs43H#w?fu-|Z0bd_qwUkL(cPL_Pk_c*((%-5+1EF!As$ z(2?2!(rgH&GEhrTMq^d92HQhn8DlBVP2$kE1VdkYFm5(?tI1O&I+?+IA#5k)L2bB} z>U^f(^i1rbBBsY`=_S|yDGxFyZ4|JcTZn1uB(8Mqy%*K#v?*j)Xo5zu+>Fg5_Kv)3 zn#^<$yX205r=l&iiI;b#SlU4$ui4}i)6_}cW0fOQZ8OcY@Kr2Kn1WHP1?a17&Hh&L z%pm2V)MTu9Y)d0SAl_M+VSjtN@D3eJ(%Pv8p=*sz5Xf9njF zc3o6}rbiGBh|-PXYbIuU7rw|Nm5_t9r*iBz@nmii>pR3SG;NkOp=1g}8G=w#JRE#~ zydKW-}|4AkIG+}EvX8+Wzzbqeut(I^s@^+|2odEp!y!a;?Gex3~L?v)q6h_bUXN+ z`9B6aj%|o2{ndMR6y%==Snt_!>hDMXGw@OQkFfpOVu!zfP{~Lg%cjT!2>b)ci`Z(V-hKf(YPXSNw+)-)u{t^ZEaz4_Z^i=vY+8DqM z*;M6M?>SL$2P>?~SNT=_zrgvIvq1Ge6b03Lh$#O#_&D=_1vn}~$v50c(B&3?@yn;w z%h~?-fUM_tyCuJZN`ED#kZ%PX_u3f7A6zXTVAr_Ti zy-!9#I%`M2oaH|YqMl#9cSgZmPuFwX{-dxjIr6Lb(c4cUe98~L z4IJ4|>$OUTPUT_BW_PATH + echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH + for n in 16 + do + for w 
in 2 + do + for m in 1 + do + for iter in 1 2 3 + do + export INPUT_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ./clear_static_ofccl.out $INPUT_PATH $OUTPUT_BW_PATH $cards + ./clear_static_ofccl_time.out $INPUT_PATH $OUTPUT_TIME_PATH $cards + done + done + done + done +done \ No newline at end of file diff --git a/test_scripts/ofccl/clear_static_ofccl_time.cpp b/test_scripts/ofccl/clear_static_ofccl_time.cpp new file mode 100644 index 0000000..4c49834 --- /dev/null +++ b/test_scripts/ofccl/clear_static_ofccl_time.cpp @@ -0,0 +1,37 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + cout << argv[1]<<" ofccl : "< a; + vector b; + string line; + // time + getline(cin,line); + + for(int t =0;t < 25;t++){ + for(int i = 0;i < (11+ranks);i++) + getline(cin,line); + + for(int i =0;i < 5;i++) + cin >> str; + + a.push_back(str); + + for(int i = 0;i < 4;i++) + getline(cin,line); + + } + cout<<"ofccl test time:"<qS3QgHXf0zt+ZHucbcP)cTl+));LsZK?Tv*WPQN zbLM0w)O74WzigPSv)5XC?X}n5d+o=0>^t1Gt1}!9O@&PDA2flKE}$wp@u<(3*~HGrUpKhdUCXhs!-ClmdEu_>>E5pt zVl+Fvhs&p+%I_0MM|SyZ1DjK8;QUh6Lot_IL24%yuILD~l$T!7(Rz7DAlTh=c~3?8 z<>jSCkx)^Ipp#yd2UXvib9n zvrx&PJc)+_rDqcu@$+#-JhJ@%mT{+J9(tapU4Y*r{0h&#=bT~xGo6w9W@ofDd-Lb- zp0f(*q~UT1pPPj8XJn8JKQ9IUw^QKHNFj$xmrQoJ>w6 zg+3RhDA#BT{K6FZzL`SL`%>t0OA7pNQpo>B3jME1!GCcI{Ie&0Q{Qpgg&=-a3jEJg*z@y{e}=X|tL&1(lwbQQ@E0VJ{~wS;{CjVe!~(=A+{SY1 zZE}WSAL8$2d_MDQ@DGt_tM9VQ$S0I*y580q3hI%lHyqVPn)Ie(-QBoMZ&S{@-KE17{NAL27BcY(T zBM{xDZz;A}D6#IRvQ5KUX>eqqDuX=qfA z*{(lS?Q{CZXsIvM9p%bH12(FzF&h5sdfRks3ofM1=hGwPL0p>N9NZc}_6`0>cc-8H zi%+>0cop9*y085feX}>vp~3G^dle@1M*Tg3D50njo5TK4mtWB~2dEKz8XiX{Y3!Uj zy-2)f)hc~yQL(nBwr1t34I6M>T2xZ5;j(IFjqWO1YNc4Oi%MKFs+Sa%8doJnW$>Yy znDu0fzghU3iTO$<<}!qp|1z}M0-r7TXy%l|bDb=}3}EReg99_3YDCmHQGe04(}@2` zW^jO$Trtt>@!N9zevDYNv>{IOU*Ee08DwkEb3A&7aCy=}f*M~5Zylf2_)2&-jYy)#X~HiQ82Klk^_h(*<*h=21&;<{1g8a$A>9at z7CbivK`XZ4t?P~o3!d^;q0)j^bxAaj1&;<~gnA49R0FGN8!UJ4K{ z(O`|xWx?|jRnU4Y_&kGK(|Rp<>QgG*Wx=1JkbpZacAvqqdYUdBfc8gbfi86W*OBTgGF;gE;Qn_aWX!7 zh7qR?lkri95ziCxcRx_&r4164--y%32+D87X+s3%H{!Grg7O=2+5kcMjW}(5p!`Oh zHat*%6^HfD71poTg#8h3uA|y8<8`*U+ZMmt7B92KFSo@nw8hW0#pm1Nr`zJ$w)kIa z6ZQSQE&jGG{+ccRvMv7Kw)jtN@gLjbKeEMtXp4Wx7XN2!-1T-%?Ae< z)edBrW9!K?aQc&|D9^yv2T@%U^Y2BH_jghh7oLIa#|bPy9-WV_6{Kr)ty3mO^3LxC zyLN~n_`e|d%kCy{$p@a;nCIZ{uJIf^p6PKs?RoZt=sXB;vw)n5k4AAQ7&$_Ts2zJL^gP#8R^GKoo9Ypl>4Z?k($YnZql0C7Z z_!_ia@QgqelP6#DT+KX-1W$s1*we7aF(isVPhtnkKE}L5LX)RphjU3J_+r0^|D2G9 z1_h9w_o$Zl#9&{%IJV;@Az+7-GJ!&`%jhPKUM!;{1VH^c{F7&3hsM-HGFr*eZ%Jku zB@F{))aZ$ciJ=`wK!a7Zz8}$5LEh)dX%6l938VXk9({wSE*Gdf&POh$oP{K33FPz% zs^i%3(2ik4AYEmN6i%TC<@^rW=2Q^+1f%2FQP02^1$khHEX?y%o^trtp&ic|qUb8q zmidDu>U)1CQACEQK2aLSXfYZCZ2tguQIX}wYr(e$N_t{s|F6^-_HPgtILf*7@1QG> z4;ry5L-8W$;aPqt@7}@fgHcqKefmL%r$Suh`) zZ3ACqJfwOwhWGr~?I&X0$NO$S;pi^8^Y-J^YP&B% z^*&0~8yC9F|Kl1eYuPt|g6&?U7z8f>ap!*+!k(a{Ptt%N--Cu?$t8CCaZjv!%+v24 zn~dE#8AXhqes~4PB#n|&~a((C}Pr_$`1sp$RnhAA$2$`c#L0FW&@0+|Q?l?wk1 zm4CM-$D?S7Q|0(g;*Fu>k`RxSg2G8bU$zJe*o>lVZWyMX6IbS=rlPJyiOpjya*Gn# zUzy_>sCP=W>vN7>Ag_kv_X>>;dis&ZEYn=H*o1PZ`tDcxVrW!Wv9=Af(V>8-2uMBn zO18%_Y-?}G%7Rg)6ABE)?}7_Tn)?u%5?Aj~B~3bIRS#8)J7l&tji|LvI*o=EYmR&R zZXeU4AD31`w7*VSY(a{=6SPf~WtdL*&pA?nrG47C-EnF)dBV-2B_LQBjm?o^+7e@rx2t z?MEBYEY@saA{Y~ImGq(b2Q+rl_&^#A#m~ip$kSg8D^XTQ zJh5LA_Pa9o{hmJeey2)#I5FjKiAQtwua&~fY!u*?CHM|=wChQgfZiP_#eU@u8sY!HugsRv+rWQaxQ5? 
ze6e2?W7Un8TupV&nL#Yy>Q_l857>@Cr$Oknf3hb4X(;}j z=#Yn$#{c|B*!Pk{p4bb(3}OJ2Q&Oyu(1*eP#u(^ax^2XZuE^1wMHKB~0A64KP&$Yw zP#WrI4j@qd%zQO5FL#fG=PJxvQUsVSxeBu!7%|5|D(fsUJH8NBLo4uDTY;?h$^IC% zZ~Ec`X8Voby8mgW{T0}$P$7Y_JXw(xRh*2uZo$T`*wo7D|lYfcO?Q)rKLPUp8ZQn0*Ccg4r!P%YpxK)G%xy+|}K7?nJ#~aA4%bgsbTuQ>uL_S*nw?P@4H7 zJ!B-#7K&h#1bvfeFv-z0O-`Y2?g z*T-d>!P1Kg7emGa=mi7RuS8FnLr$HB;>03fg`BLbmt?9XT7gwOK^3>$J(hPrEvK{N z8EzgcQRD>n3 zNtPsEP=V}GVjjU_b%V(Eh*5i2xrd`GfDtx%hG#bH8lMzVn!*iAx8RBWEIurZ;6Ca^ zr;Iy~JuV8-ceujhDJJKtavXr4F^7;5h(bj!m#Cq*Q#2zQcwm;Hc%{hxS9H06C?erd z{CTMW@_v{&k6lHkCZ^0csX0(*q4J3q_$~&i_@9B7Jcr_W$G{`^3+B;fqT;X?wU^_t z))>ZCk_3n`f`;oDP1sPwMx8xFiZI76H5)P-|Bn-Q4U8%k>q&GIQME$JLy&_RqNVz| z9;xpbOUQgmF}be6VJd!dGy23LKucHrtmC7M=sT}|xys^*&5@2i@sVkeqC1D78Fy0P|> zzd((g_|IO0{DXhVjGpg$k<-`4#xd!uj=fPeG4I!&zC#Yr@|U~cM7eTqxRIuJRX0`X zhc<66$~%8MWhe5ZDi=+X?mHQGngn^~&x{3TEHGn%84LWKEa1R9Eh?xhOg97;n>cvm z#(W2-64$->i5D)2nHUgG%5P_~q@l?jiJ?aT=(hiC{3y>;@T1%Kr{OmTKQZmbpRnfD zoT1G=Ux)KbrzS6={z%jr4Rrdi)GjrGw#Dy@hQe1~sn_XvmyX`h(*wbXKOEH@3o@^| z3;NeX?vh_mOdP=VLn9LtuL2$bJOQ}mm5GT50q6gAV&Z)$`Nr!L6J1cG^bJfH0P6tf z0q+AW1>6GI1o%zBDB#n8y8vGYd;~BX&Qb^$&PxD)VafDZ$H9QV|Q0h<8F z0KWj3k9P7PU@_nrU_Ia*Y!N*IcpKmeK=m$+rwP;P5MtjKO1zR zucB`@>2HC4Dd^&TFh#%4q|bpqQ_+{4gbwkV=a~nVAPOg(6hWUC8G*==Z70@d)T+phLbXqs1)8bD*C9Jy|)ONJ4fb`X>Oh z9)NX~9vL1Xgm`G7Nw)A1&z%;YJIs7mz(1`6&mG_~Wo$9Zu@Us|f_|};ev_I0Hqajh zo$O+yzrm#M0sV2%>2W55USrY^fc_BZ$;wG~e2M8;LYV0TX8IGLKY{dlR{Cm_eh%u1 z^j~hJcbW7RpdXn+zm1@i{zcaGeltDom{aEls{vQV4(6H0R2oX ziOQ|?PBZ;Wpf3SES^3E3kD&bLfc`#U);Hj6RXcE_J-(h{jUuAT6r>72?i=}Dd?BdIB2Cq2DN;mH-Wwm z^kr6h(3Bqq-4FVwtn`d}L;fz%zY2OXdpMEi5zu#nzQLNF%3$$@N5OL#JgIDc9Q0>E zPt`XIVCMe@eT6lTh*^dT(8oc)$V$J_q;CM7<_f9Weh=ugFgHlmc6Wn*4(O@sU_a;{ z(95j(cbNGf1^p(_7g_1ooAl$LdqGcTW2(af%pDp*Pvu)GKyLs&V3nV-Nn}9jX^ygI z3i&j5`5fr0t?6%<`ZGFy`TeV})M zp3onV{xh8Z2(Po;kx^w0|v6L>DxEPqC|D4-Ll9LzcQfG*zMS9Ks=5~a|{*DV20 zJ@W7qVs&o*=QC<@3qF_W&UFrCxpNEe$*#^V{-;@<+={;0YjP`txfNBp#Z|e5D|4MI za|>4H=C91n5#LLQLQW6lzy%orkJg>J_Ga9>b7X&gw)WkOXEI-&r9CklETD)VuD*Ps zW5HF|YuZm7c z@xak{92Pz!^O=ho3(Qzx#sV`In6bc&1!gQTV}Th9%vfN?0y7qvvA~D5fY@NbH<7V4 zrJ%mEr`D!+NPDLg6rR2TNMViyn&y-jr*pjLN_ipH>9|zi0bIazb*}|)sEMGy`$u~S z6tE2@@W)wf(t?_zEt#(p%hf0t>b?L{RyQJuF}pA0BxZ<%3-X3Xdp@ z_--~VZsm%&(i682Mf_T(zI}>UN36PKl{xSkG`1!!Cxs40kfz%kW`_ zgA9imjxZc!s6J+xzfcpw$*`DVCBu4#n;3R6>}9xoXu1h3drAQ4k(Yp#%MO)bD>!1}$@tKW0|M)}1=e1Vp%UuJ4&8GbC0A6~Z0ijmiumhXXEshP%U z5+>Gx%Pj5P2@@l)$PHy+x3qpyB8A)|;d^a(st&@d@kXVhdr%JOvp*~`lph7U@8C#c zhfd&0j{2S~@lx1^pF^8$&)vW$)Bh2{KUd4C#07%N_h;gIu69Z#VNLw2;(D%jY9(P! 
z{JY|Mf%ce9e-h}xm-XjV5;y4}!^Moxuat3>?{dbUVZ+l44kTx;4Sy5k=P_Q%zn$>~ zHvCr@e~t}LuPT##rwu>C_{BDSHpWNdFSOwoGk%E;znbyIHoTAVr8fMXjIXfa_cDHk z4L`*AN*n$i##h_$ABTS={XI53oeMztS{uHN@%1+R=NaE*!+)Rg8*KO&8NbnnpM{E} zd^g$f^x8k+TW$F38Q*Th-@*6}8=l@#BspC+{Bw+t+VCeB-($mHAnr@d)o!!lYZ%{a z!{5sI?Kb@VjK9l<{}JQwvEhHq_?fJ2_)gwd|S6YsxJgdaB@!#{i{tXkjDNf^ z5kJQGIpQHDJafUqHk>Nuyme_J|8C&(wX?MX+j#ml<}c)YS)7)E1qkVLHd@C^pA$wMoO9AJx zoO^*!#{W_Z{-QIaoZZW%fLfNb7x;xJm->D@n!N}IfKMiW4v?hJp$f_8Vg44zKh5n^ z;h#gn=VnZv_dd-0H*>q(llRe*KC zll&LBzbKxE(cq~+Iu}W9RgXVl{?oXI5GC!KV2)h{nM_YgY7VtFQ*~h5}{CqyvYlC4H zz~J9SdrQPo%;A*z+WpUZ#=pS%BDn}(0X~_1{vCMIe+L^<@tlW&lJFT^FXd8<_6^`k zPkVd#9`oDB_jiC-GFgNG^DqcfKhmz0SU(f#DRRoUiuFhL7hwtGbIW8r!1x&BkCaHn z2F7Qgf0CT1SaGGpCm8=Mw=3@6+8*E)4~r0Be+v9BSr>76d*Sd&pMT(XuG;M= z@J{Bjf^3Xm#Q!ap54R>lD~)SbM!vp{`R(oPKLvlj*2VRk&HOJgzuiAyf%>L=ALsT4 z^NO&G@gBB=@~dw%-tOP3;rB_-=eS~#oe1|a-pPJa)%yp)Cp(vI9vDc@qs)kE5}^h0 zWOlfd`7_xMi02iN;tk-PGHJr!KQRAwY-esRS{v$#^8F;Ye+=s)bOG<=Yb&^$`ETNU zxp`^7W&B-$6qL?u!9eou?cwXdD;}AG|N6qwNVFSo()u*>Tsb}3sr%?;xd_g9Y7Oaa z9ibL)hu(@8pCY=qyGQedI=edjQGaVuMR7UKm`cW^&UY1U8Viozix^S#lW3<+_yer`M`L_ll ze%%)e;$X9GA5Pm6q^07uEHAW$%r=QE53~hCVVqe-92L4A2vMGhu3L^X_JZAXgk5ck z3nlSy_I7tfb-dVhi{2IL2>7q@-6sexMTvaHZr}<_mRn;0Ujj46b7Ls=BI1 zUpUYeP03Uy4!uiBuJHFnaju!aH6_PV@77yV5=+~{{;rhNGXIohSBXw_vn?3v40tepGp_9uaKL20I*Am+LSLN%Uf^WGjhBqjhV$w$ADwRB>TClrg ziZYZ-Ms%7ej-U;7r*3F2X6*|5x9Hw2I3q4~NlG_|!Vr9NQx`L1WQvC4QqqDUbepaz zO1+GwZ4QJZQ9Tswn1XkyIPww#PtrGB%op9(HAN39kzD?W&pQV z6R3-9+-{s3+oU(5N-FdQ91Y7uV2NH$<_>M6$wf_VOR?LHQCU<$XxsGibi22Rj6@;+ z9*3l|>C~Z@m=s}jpJpXhmxz;Z$leU`P;d1{ zy+%IMEc_}_ct43Unn${9V49QERhpVJK-bcsf2%xqm!^-?vbVd!=w6-mYw46;%M_Ci z_{BW56SL_64saHkOs`Kx%jK5{YPffmqj$McA{?)~4E?HRJ)9fPbJe>s(0;^rG>*JY zc6@O{zaT}{TFm{?TBDRsT&A96KR~&%(sClLq^d)0?H+I%{!Ugxsh&`BI^dT4>GaOB zP7oz)lMeLm4jLnu9|sI$rX-!GYpJ41B>c$I&d?T&e1B7$bHi~f7rshwT*>ZRHNs=|XX) z2y(YFmQZLt>1h3~GAv^;mopY)AH@nm%)G^939CJ0AUQTdx?4t+dpW3-zQH%x+p+W2sB4 zUD-fk1#=BT)6;3Scr|OXtP|6?Ltq_~%Jgf+e2hjVu~}e@O#1b$9rcNWi0FW&G|{YS zNL}42+mGA5p3GAo_KUeH#yvl_xqN=KB6q3Sj9U*w3ik=F^zDe)$}0x2$)y!-AL_Y$ z?YJ|+yD&aCwooRudLI;q6!`jt_6*H~TRZKjb&a8ph`WAuLu05>PwW7=Vd4*U`IEVn zbe7*-uTl-$XQk;FUof>3>)^CDezPCZT3&Vq+R(#L>ubM7-;B*a(|?s>5!h6>N^RAN z8!zeGTEQ4rr*;keZehVRdHrRRJ~LQjc1Y_qAlc%9oX;|NRhmnxev`&R8=}NsimYPH z&M*LXdT;UTa>2P7+a3NMpC7lxjK$}4d;|3|+Y(lERE#fjxz-q^djVU$;ntd|2Aiq8 z52g{x9T40Odm#2cM$!Lf7mlTR9dtZzoo4xQ|=YQrqHki?U#9JgId~t1zu@6%j5N{iCKR}lzB#O;LA8r=7q)GA>1jzEh~_eHwJT`O1t)0TQ3 zz|&$4nj;&_K(X3HCKx(O?Q6G5W5`Ew2MX55m<3bd*$&~)r{(2v%MKn+l;dyom+UOL z`Qs4_9XH>6?TH;A$%arO1J!goDyyu;-xdf;A4`5NjYHpZ41I0>sNU42CQqSoI-U7a z*iOiU+Gtv;^XXpQHNBgPm>#R8mt6lR+{jGZC}7;T5YyB&u5`@37uD%3Q^?N1G>u|3 zGd7RdJ94jU)RW!pk~;#fvev+6Uf$_qX$OVedZR~7Q`5Z1Do3W8CYopAQ7lZD{9&vG z==ruLZ;QBRkZ@Bf9V;Hw((s$j!L0%G;s$@DyVH*~$*NWQ(xT$@jV^L|WqNIT@J>ok zV{cXCdzvd}nYzo5#v^yWg$woHI=#l|Zb3tuG#R7yxV&4HX6@CWiagWdYfFuNQ8@2( zF7Zt((?b|)Cr4O%sn6K6G^YG%8ae_!v=)}_Qp{V$V>9NW+<3+WwPHBuP|MAD-~k)9 zP~yYRaB0^?IcT~CQ4cTOARaT(+dA6YTBYNV`_lx-Ez-B#ee7Ws5%?!Si7W z5gjzNWiKV*tFDfyRzx@IwIW;=wT1X1;`eDq*k?ynco zZ%_Y8&~cPQNa?S>x1%6Gu5WyA$F9E@=}*H?r9aC0E4Yc{R7SF$O0VA62mnVcD!ux? 
zkAmuZK41~SZvR_>rEle^^z{BK1qD4aYUr={6#NY6^z9uLSKl#Fa4)AP{Yg)yKclq) zT#!vwdiA{%1$$ZHS)4$nSLMH-)30Q@`Ywus>U%_#{w)0L=^p@&vQY9hCn6M_;xFES zN(}b;-vhEWz0)cA6;%2wDR>h~1O@*EF>8ADJr)HOA1A{b9Wp5Rur0m%Zi|A}GdL)v zN~hpsw)EFg1?q==f{~x5s ztJb1#P!8&QU&9+IX4aRCneA8UkqPDLT`zVM#`gzCIKARal|B#YsqL%u>ialxl|S2$ z`Y#0~Pql+@14s5#>DBkZj^9Mw;3uLbsPqaJfNo81yf3*)CQ|)eiBRcO{5&KkoXWql zi_>kAu-8`ql@e|IUBvkp8yI;}ut`FGt;(R1t`w}$mW7~uHVL57d(#x?{XHE&rN1iI oDZnY+acP8o-STosz39ahm9Jtv0~g8ChrTX3=u8`Hutl~12iOU-{r~^~ literal 0 HcmV?d00001 diff --git a/test_scripts/ofccl/run.sh b/test_scripts/ofccl/run.sh new file mode 100755 index 0000000..c3a25b1 --- /dev/null +++ b/test_scripts/ofccl/run.sh @@ -0,0 +1,46 @@ +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring + +export DATE=221221 + +export TRAVERSE_TIMES=10 +export TOLERANT_UNPROGRESSED_CNT=10000 +export BASE_CTX_SWITCH_THRESHOLD=80 +export BOUNS_SWITCH_4_PROCESSED_COLL=0 +export DEV_TRY_ROUND=10 + +# export SHOW_ALL_PREPARED_COLL=1 + +for MY_NUM_DEV in 2 4 8 +do + unset CUDA_VISIBLE_DEVICES + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi + export RES_DIR=test_result_${DATE}_${MY_NUM_DEV}cards + if [ ! -d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 16 + do + for w in 2 + do + for m in 1 + do + for iter in 1 2 3 + do + export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M #16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/ofccl/static.sh b/test_scripts/ofccl/static.sh new file mode 100755 index 0000000..3a65584 --- /dev/null +++ b/test_scripts/ofccl/static.sh @@ -0,0 +1,21 @@ +g++ statics_ofccl.cpp -o statics_ofccl.out + +g++ statics_totalCtx.cpp -o statics_totalCtx.out +export RES_DIR=test_result_221120_2cards +export OUTPUT_PATH="./$RES_DIR/result_statics_all.txt" +echo $(date +%F%n%T)>>$OUTPUT_PATH +for n in 4 +do + for w in 2 + do + for M in 4 + do + for iter in 1 2 3 + do + export INPUT_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_M"$M".txt" + ./statics_ofccl.out $INPUT_PATH $OUTPUT_PATH + ./statics_totalCtx.out $INPUT_PATH $OUTPUT_PATH + done + done + done +done diff --git a/test_scripts/ofccl/static_time.cpp b/test_scripts/ofccl/static_time.cpp new file mode 100644 index 0000000..c079845 --- /dev/null +++ b/test_scripts/ofccl/static_time.cpp @@ -0,0 +1,32 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + //cout << "bandwidth"<<" "<< argv[1]<<" "<< argv[2]< a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + double tmp; + line >> tmp; + + a.push_back(tmp); + } + cout << argv[1]<<" time: "<>$OUTPUT_PATH +for n in 4 +do + for w in 2 + do + for M in 4 + do + for iter in 1 2 3 + do + export INPUT_PATH="./$RES_DIR/test_result_"$iter"_n"$n"_w"$w"_M"$M".txt" + ./static_time.out $INPUT_PATH $OUTPUT_PATH + + done + done + done +done diff --git a/test_scripts/ofccl/statics_ofccl.cpp b/test_scripts/ofccl/statics_ofccl.cpp new file mode 100644 index 0000000..462fffe --- /dev/null +++ b/test_scripts/ofccl/statics_ofccl.cpp @@ -0,0 +1,36 @@ +#include"bits/stdc++.h" +#include 
+using namespace std; +int main(int argc,char* argv[]){ + //cout << "bandwidth"<<" "<< argv[1]<<" "<< argv[2]< a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + double tmp; + line >> tmp; + line >> tmp; + a.push_back(tmp); + line >> tmp; + b.push_back(tmp); + } + cout << argv[1]<<" algbw: "<>c){ + if(c == '!') + break; + flag =true; + flag2 =true; + for(int i =0;i < a.size();i++){ + if( c != a[i]){ + flag = false; + } + if(i < b.size() && c != b[i]){ + flag2 = false; + } + if(flag == false && flag2 == false) + break; + cin >> c; + } + if(flag){ + cnt++; + int tmp = 0; + while( c >= '0' && c<= '9'){ + tmp = tmp*10 + c -'0'; + scanf("%c",&c); + } + sum += tmp; + } + if(flag2){ + cout << (sum * 1.0)/cnt< Date: Wed, 21 Dec 2022 10:27:58 +0000 Subject: [PATCH 071/109] +order --- test_scripts/nccl/run.sh | 7 ++++--- test_scripts/nccl/static_nccl.sh | 5 +++-- test_scripts/ofccl/clear_static_ofccl.sh | 5 +++-- test_scripts/ofccl/run.sh | 7 ++++--- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/test_scripts/nccl/run.sh b/test_scripts/nccl/run.sh index 0e63f35..8b92e0f 100755 --- a/test_scripts/nccl/run.sh +++ b/test_scripts/nccl/run.sh @@ -3,6 +3,7 @@ export NCCL_PROTO=Simple export NCCL_ALGO=Ring export DATE=221221 +export NCCL_ORDER=1 for MY_NUM_DEV in 2 4 8 do @@ -10,12 +11,12 @@ do if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export RES_DIR=test_result_${DATE}_${MY_NUM_DEV}cards + export RES_DIR=test_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards if [ ! -d "$RES_DIR" ]; then mkdir $RES_DIR fi - for n in 16 + for n in 32 do for w in 2 do @@ -26,7 +27,7 @@ do export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" ## Time echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M #16M 32M 64M 128M 256M 512M 1G + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G do ## Test /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH diff --git a/test_scripts/nccl/static_nccl.sh b/test_scripts/nccl/static_nccl.sh index 761ff36..9cf7ad2 100755 --- a/test_scripts/nccl/static_nccl.sh +++ b/test_scripts/nccl/static_nccl.sh @@ -2,15 +2,16 @@ g++ static_nccl.cpp -o static_nccl.out g++ static_time.cpp -o static_time.out export DATE=221221 +export NCCL_ORDER=1 for cards in 2 4 8 do - export RES_DIR="test_result_${DATE}_"$cards"cards" + export RES_DIR="test_result_${DATE}_${NCCL_ORDER}_"$cards"cards" export OUTPUT_BW_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards.txt" export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards_time.txt" echo $(date +%F%n%T)>>$OUTPUT_BW_PATH echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 16 + for n in 32 do for w in 2 do diff --git a/test_scripts/ofccl/clear_static_ofccl.sh b/test_scripts/ofccl/clear_static_ofccl.sh index 1d70cb4..dc8646c 100755 --- a/test_scripts/ofccl/clear_static_ofccl.sh +++ b/test_scripts/ofccl/clear_static_ofccl.sh @@ -2,15 +2,16 @@ g++ clear_static_ofccl.cpp -o clear_static_ofccl.out g++ clear_static_ofccl_time.cpp -o clear_static_ofccl_time.out export DATE=221221 +export OF_ORDER=1 for cards in 2 4 8 do - export RES_DIR="test_result_${DATE}_"$cards"cards" + export RES_DIR="test_result_${DATE}_${OF_ORDER}_"$cards"cards" export 
OUTPUT_BW_PATH="./$RES_DIR/result_statics_ofccl_"$cards"cards.txt" export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_ofccl_"$cards"cards_time.txt" echo $(date +%F%n%T)>>$OUTPUT_BW_PATH echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 16 + for n in 32 do for w in 2 do diff --git a/test_scripts/ofccl/run.sh b/test_scripts/ofccl/run.sh index c3a25b1..f7158da 100755 --- a/test_scripts/ofccl/run.sh +++ b/test_scripts/ofccl/run.sh @@ -3,6 +3,7 @@ export NCCL_PROTO=Simple export NCCL_ALGO=Ring export DATE=221221 +export OF_ORDER=1 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 @@ -18,12 +19,12 @@ do if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export RES_DIR=test_result_${DATE}_${MY_NUM_DEV}cards + export RES_DIR=test_result_${DATE}_${OF_ORDER}_${MY_NUM_DEV}cards if [ ! -d "$RES_DIR" ]; then mkdir $RES_DIR fi - for n in 16 + for n in 32 do for w in 2 do @@ -34,7 +35,7 @@ do export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" ## Time echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M #16M 32M 64M 128M 256M 512M 1G + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G do ## Test /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH From 1c9a007bf2a73d90f2d4227be1650c5ef765a52f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 22 Dec 2022 09:27:09 +0000 Subject: [PATCH 072/109] 28 is occupied --- ofccl_test.sh | 6 +++--- test_scripts/nccl/run.sh | 4 ++-- test_scripts/nccl/static_nccl.sh | 2 +- test_scripts/ofccl/clear_static_ofccl.sh | 2 +- test_scripts/ofccl/run.sh | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 3465366..073c8d0 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,10 +42,10 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=16 + export NITER=8 export NBYTES=8K export WARMITER=2 export MITER=1 @@ -95,5 +95,5 @@ elif [ "$RUN_TYPE" == "NCU" ];then fi echo cmd=$cmd -$cmd #> /home/panlichen/work2/ofccl/log/ofccl-2ms-coll-master.log +$cmd #> /home/panlichen/work2/ofccl/log/ofccl.log diff --git a/test_scripts/nccl/run.sh b/test_scripts/nccl/run.sh index 8b92e0f..198528b 100755 --- a/test_scripts/nccl/run.sh +++ b/test_scripts/nccl/run.sh @@ -16,7 +16,7 @@ do mkdir $RES_DIR fi - for n in 32 + for n in 8 do for w in 2 do @@ -27,7 +27,7 @@ do export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" ## Time echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G do ## Test /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH diff --git a/test_scripts/nccl/static_nccl.sh b/test_scripts/nccl/static_nccl.sh index 9cf7ad2..d8f5883 100755 --- a/test_scripts/nccl/static_nccl.sh +++ b/test_scripts/nccl/static_nccl.sh @@ -11,7 +11,7 @@ do export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards_time.txt" echo $(date +%F%n%T)>>$OUTPUT_BW_PATH echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 32 + for n in 8 do for w in 2 do diff --git 
a/test_scripts/ofccl/clear_static_ofccl.sh b/test_scripts/ofccl/clear_static_ofccl.sh index dc8646c..2c3849b 100755 --- a/test_scripts/ofccl/clear_static_ofccl.sh +++ b/test_scripts/ofccl/clear_static_ofccl.sh @@ -11,7 +11,7 @@ do export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_ofccl_"$cards"cards_time.txt" echo $(date +%F%n%T)>>$OUTPUT_BW_PATH echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 32 + for n in 8 do for w in 2 do diff --git a/test_scripts/ofccl/run.sh b/test_scripts/ofccl/run.sh index f7158da..ab10f96 100755 --- a/test_scripts/ofccl/run.sh +++ b/test_scripts/ofccl/run.sh @@ -24,7 +24,7 @@ do mkdir $RES_DIR fi - for n in 32 + for n in 8 do for w in 2 do @@ -35,7 +35,7 @@ do export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" ## Time echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G do ## Test /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH From 54ff526838e821662cac1be18a4c3bdd5161f6b6 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 22 Dec 2022 09:57:43 +0000 Subject: [PATCH 073/109] fix bug in nccl-tests/src_manual_size/ofccl_all_reduce_ms.cu --- src_manual_size/ofccl_all_reduce_ms.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src_manual_size/ofccl_all_reduce_ms.cu b/src_manual_size/ofccl_all_reduce_ms.cu index 74f4866..ccde169 100644 --- a/src_manual_size/ofccl_all_reduce_ms.cu +++ b/src_manual_size/ofccl_all_reduce_ms.cu @@ -166,11 +166,10 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t return testSuccess; } - - -#pragma weak ncclTestEngine=allReduceEngine struct testEngine allReduceEngine = { AllReduceGetBuffSize, AllReduceRunTest, AllReduceGetCollByteCountList -}; \ No newline at end of file +}; + +#pragma weak ncclTestEngine=allReduceEngine \ No newline at end of file From 57875d6dac337d1e422d02564a40626b345c3908 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 06:28:52 +0000 Subject: [PATCH 074/109] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E5=AE=8C?= =?UTF-8?q?=E6=88=90=20auto=5Ftest=20=E5=BC=80=E5=8F=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 4 +- test_scripts/auto_test.py | 186 ++++++++++++++++++ test_scripts/nccl/run.sh | 39 ---- test_scripts/nccl/static_nccl.cpp | 7 +- test_scripts/nccl/static_nccl.out | Bin 43920 -> 0 bytes test_scripts/nccl/static_nccl.sh | 29 --- test_scripts/nccl/static_time.cpp | 6 +- test_scripts/nccl/static_time.out | Bin 43920 -> 0 bytes test_scripts/ofccl/clear_static_ofccl.cpp | 5 +- test_scripts/ofccl/clear_static_ofccl.out | Bin 43928 -> 0 bytes test_scripts/ofccl/clear_static_ofccl.sh | 29 --- .../ofccl/clear_static_ofccl_time.cpp | 4 +- .../ofccl/clear_static_ofccl_time.out | Bin 43936 -> 0 bytes test_scripts/ofccl/run.sh | 47 ----- test_scripts/ofccl/static.sh | 21 -- test_scripts/ofccl/static_time.cpp | 32 --- test_scripts/ofccl/static_time.sh | 21 -- test_scripts/ofccl/statics_ofccl.cpp | 36 ---- 18 files changed, 197 insertions(+), 269 deletions(-) create mode 100644 test_scripts/auto_test.py delete mode 100755 test_scripts/nccl/run.sh delete mode 100755 test_scripts/nccl/static_nccl.out delete mode 100755 test_scripts/nccl/static_nccl.sh delete mode 100755 
test_scripts/nccl/static_time.out delete mode 100755 test_scripts/ofccl/clear_static_ofccl.out delete mode 100755 test_scripts/ofccl/clear_static_ofccl.sh delete mode 100755 test_scripts/ofccl/clear_static_ofccl_time.out delete mode 100755 test_scripts/ofccl/run.sh delete mode 100755 test_scripts/ofccl/static.sh delete mode 100644 test_scripts/ofccl/static_time.cpp delete mode 100755 test_scripts/ofccl/static_time.sh delete mode 100644 test_scripts/ofccl/statics_ofccl.cpp diff --git a/.gitignore b/.gitignore index 5999837..81a260f 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ .vscode -test_result*/ \ No newline at end of file +test_result*/ +*.xls +*.out \ No newline at end of file diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py new file mode 100644 index 0000000..06799ae --- /dev/null +++ b/test_scripts/auto_test.py @@ -0,0 +1,186 @@ +import os +import xlrd +import xlwt +# 设置环境变量 +os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" +os.environ['NCCL_PROTO'] = "Simple" +os.environ['NCCL_ALGO'] = "RING" +# test +# f = os.popen("./nccl/run.sh") +# print(f.readlines()) +# 设置超参数 +# run +DATE="221222" +runNcclTest = False # 运行nccl测试 +collectNcclResult = True # 统计nccl测试结果,写入xls +runOfcclTest = False# 运行ofccl测试 +collectOfcclResult = True # 统计ofccl测试结果,写入xls + +NCCL_ORDER="1" +resultXlsName="result_"+DATA+"_"+NCCL_ORDER+".xls" +n = 2 +m = 3 #nccl +w = 2 +M = 3 #ofccl +NUM_DEV = 4#设备的卡数,实验用到的卡数写在循环里 + +# static +os.system("g++ ./nccl/static_nccl.cpp -o ./nccl/static_nccl.out") +os.system("g++ ./nccl/static_time.cpp -o ./nccl/static_time.out") +os.system("g++ ./ofccl/clear_static_ofccl_time.cpp -o ./ofccl/clear_static_ofccl_time.out") +os.system("g++ ./ofccl/clear_static_ofccl.cpp -o ./ofccl/clear_static_ofccl.out") + + + +table = xlwt.Workbook() +bwSheet = table.add_sheet('bw') +tmSheet = table.add_sheet('time') +cnt = 0 +for MY_NUM_DEV in [2,4]: + + if 'CUDA_VISIBLE_DEVICES' in os.environ: + del os.environ['CUDA_VISIBLE_DEVICES'] + if MY_NUM_DEV == 4 and NUM_DEV == 8: + os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,4,5" + # nccl + # 创建存放实验结果的文件夹 + NCCL_RES_DIR ="./nccl/test_result_"+DATE+"_"+NCCL_ORDER+"_"+str(MY_NUM_DEV)+"cards" + if not os.path.exists(NCCL_RES_DIR): + os.makedirs(NCCL_RES_DIR) + # 统计结果 + NCCL_OUTPUT_BW_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards.txt" + NCCL_OUTPUT_TIME_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards_time.txt" + + + if runNcclTest == True: + + os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_TIME_PATH) + + for iter in [1,2,3]: + NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + + os.system("echo $(date +%F%n%T)>> "+NCCL_RES_PATH) + for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: + os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RES_PATH) + + os.system("./nccl/static_nccl.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) + + if collectNcclResult == True : + # bus + bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡') + + with open(NCCL_OUTPUT_BW_PATH) as f: + content = f.read() + bw = content.split() + + axis_y = ["64" ,"128", 
"256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + for a in range(0,25): + bwSheet.write(2+a+cnt*30,0,axis_y[a]) + # + for k in [0,1,2]: + bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k)) + for i in range(0,25): + bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2]) + + bwSheet.write(1+cnt*30,1+15+k,'nccl-busbw'+str(k)) + for i in range(0,25): + bwSheet.write(2+i+cnt*30,1+15+k,bw[i+k*50+25+2]) + # avg + bwSheet.write(1+cnt*30, 4, 'avg-algbw') + bwSheet.write(1+cnt*30, 19, 'avg-busbw') + for i in range(0,25): + bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ) + bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3')) + + # time + with open(NCCL_OUTPUT_TIME_PATH) as f2: + content2 = f2.read() + times = content2.split() + + tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡') + for a in range(0,25): + tmSheet.write(2+a+cnt*30,0,axis_y[a]) + for k in [0,1,2]: + tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k)) + for i in range(0,25): + tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2]) + # avg + tmSheet.write(1+cnt*30, 4, 'avg-nccl') + for i in range(0,25): + tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ) + + + #OFCCL + # 创建存放实验结果的文件夹 + OFCCL_RES_DIR ="./ofccl/test_result_"+DATE+"_"+NCCL_ORDER+"_"+str(MY_NUM_DEV)+"cards" + if not os.path.exists(OFCCL_RES_DIR): + os.makedirs(OFCCL_RES_DIR) + # 统计结果 + OFCCL_OUTPUT_BW_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards.txt" + OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" + + if runOfcclTest == True: + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_TIME_PATH) + + for iter in [1,2,3]: + OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" + + os.system("echo $(date +%F%n%T)>> "+OFCCL_RES_PATH) + for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: + os.system("../build/ofccl_all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ OFCCL_RES_PATH) + + os.system("./ofccl/clear_static_ofccl.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./ofccl/clear_static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) + + if collectOfcclResult == True: + + with open(OFCCL_OUTPUT_BW_PATH) as f2: + content2 = f2.read() + bw = content2.split() + #bus + for k in [0,1,2]: + bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k)) + for i in range(0,25): + bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2]) + + bwSheet.write(1+cnt*30,5+15+k,'ofccl-busbw'+str(k)) + for i in range(0,25): + bwSheet.write(2+i+cnt*30,5+15+k,bw[i+k*50+25+2]) + # avg + bwSheet.write(1+cnt*30, 4+4, 'avg-algbw') + bwSheet.write(1+cnt*30, 19+4, 'avg-busbw') + for i in range(0,25): + bwSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ) + bwSheet.write(2+i+cnt*30, 19+4, xlwt.Formula('SUM(U'+str(2+i+cnt*30+1)+',V'+str(2+i+cnt*30+1)+',W'+str(2+i+cnt*30+1)+')/3')) + + # time + with open(OFCCL_OUTPUT_TIME_PATH) as f2: + 
content2 = f2.read() + times = content2.split() + + for k in [0,1,2]: + tmSheet.write(1+cnt*30,5+k,'OFccl-'+str(k)) + for i in range(0,25): + tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2]) + # avg + tmSheet.write(1+cnt*30, 4+4, 'avg-OFCCL') + for i in range(0,25): + tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ) + + if collectNcclResult and collectOfcclResult: + bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl') + bwSheet.write(1+cnt*30, 24, '(ofccl-nccl)/nccl') + tmSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl') + for i in range(0,25): + bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)) ) + bwSheet.write(2+i+cnt*30, 24, xlwt.Formula('(X'+str(2+i+cnt*30+1)+'-T'+str(2+i+cnt*30+1)+')/T'+str(2+i+cnt*30+1) )) + tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ) ) + + cnt = cnt+1 + +# 保存 excel +if collectNcclResult or collectOfcclResult: + table.save(resultXlsName) \ No newline at end of file diff --git a/test_scripts/nccl/run.sh b/test_scripts/nccl/run.sh deleted file mode 100755 index 8b92e0f..0000000 --- a/test_scripts/nccl/run.sh +++ /dev/null @@ -1,39 +0,0 @@ -export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib -export NCCL_PROTO=Simple -export NCCL_ALGO=Ring - -export DATE=221221 -export NCCL_ORDER=1 - -for MY_NUM_DEV in 2 4 8 -do - unset CUDA_VISIBLE_DEVICES - if [ $MY_NUM_DEV = 4 ]; then - export CUDA_VISIBLE_DEVICES=0,1,4,5 - fi - export RES_DIR=test_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards - if [ ! -d "$RES_DIR" ]; then - mkdir $RES_DIR - fi - - for n in 32 - do - for w in 2 - do - for m in 1 - do - for iter in 1 2 3 - do - export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" - ## Time - echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G - do - ## Test - /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH - done - done - done - done - done -done diff --git a/test_scripts/nccl/static_nccl.cpp b/test_scripts/nccl/static_nccl.cpp index 3c8b2b9..f12519a 100644 --- a/test_scripts/nccl/static_nccl.cpp +++ b/test_scripts/nccl/static_nccl.cpp @@ -5,7 +5,7 @@ int main(int argc,char* argv[]){ freopen(argv[1],"r",stdin); freopen(argv[2],"a",stdout); - cout << argv[1]<<" nccl : "<v&HYXGg@Lhg6%z4ke0 zPG(Y!!~MKJa@xr{d#$zCUVH7e*Zy&4&c5ASzdFa|($tl!eORNEdx6AMGhxq{_y(X_ ztJ2QKU$=IFHVx@Kfs^!V2|$(CNKPs&V0s+r#bcNa>$g7$o5o8gU;KHQZGeO&JoLMhN#Pcj$f zOLD#>=ToqV?Wv%$C+SF^DwgYnZqVp*i@%!Y%(sE_IbkjHD@bL~)%>#2KAG|wIbU9l zl;ecExqb?&@~%Zbvdg=@*qmAumzSv>O1a(&y10N3b%olNRD7tbeQ{SP9PeG+TeW2I zl8Ta8q@+x6kX}>UAsw{=y99zM4VL`!eWra|ZlxGRXf$2K}$j!2iJv_-8WU@5@kM@;f&^&Z{Yd z{@=)er@rHM3PJj+4EUdAu;*Qne~va+tL~A)R9yQ!@E4_#|0LuP|K3|9u^4H(Ze=-* z4mm@x5Aknjd=c|&hz~KNJ#hKu6cd(cy57+p3F|S#A2oDc)9YFq^!8vh*b#~u!Dvgv zs;)>l*y3;N3QAgf$_>p%S*flYozcivy(<{*Fgo>MG#ZIEt_^59s9mwTK(pcT1Umgu z-H7@_1`yt6KmK@|KqznX$3g);5;LMfe|KGr-cqW2o0sZMYu7a!OLaZa+w1Xo$|M(< zLg9|O<_f*JQg89-UN6eX%$Kp?RUYpapZayMvraaGpLX96yFq##k+$PUjiz$ zAgTmz)&re4>znhn0I9*m58CanDyj~ zzj^qZi}^|}<}!qp|8lhH0-rDVXy#PFbDccE9AN1qFBfJ!wMeLOqVbY#XA%F8nZX54 z3dKZk#AnON`!Qlo(}p)((qMKd zbhZY4PcbfjUg!*(nN8#Qx|+|c@kU!nxioEr=j#gp#v+M7>c9^!koW=~R}{ZWABjjh z&H3E&R{R~d;aL@dAF<&-VB zJSH7!&Y&(ez7pO(KCAJS@N629MUB&hzffT0pCZ<0I+9ejx{7UhID~n*ZFmgn=C#O% 
zXIBukQXAgB?x?ciscdys+wiI_iRQE6;XvlqXv3drVl{1p4G)JmuT3`m3=^wqoi;oi z*1URbcwV9kTCWX1)8y8)?KV91DRu3#;m=V>z&$p+eSZCb4gWqH|6UuO)tXT#4? zNWlFz{JA##0UQ4PHvFIsf1V9LbvtE&DGN+lV9Ekh7MQZYlm(_NFlB)$3rtyH$^ugs zn6iMTy)pCrzxn!JEAaK_|8hOHD|QYVIb*}VzQYBFgn7m)|8t$DjgDRXJiyEg1b-({ zMvwhwY;0@~<%+c8!e;9ZZXre!7_TxZ>DKuW%Ss!W|}rsMvv8)Y1&8`J?1gfw1F~u>_Rh58z-a3 z&N0)pVKREmWu|9}^q<~Q^`#9G)Za|g#t73^}OJ+Id#p82RR@mpWtOD7sz z>Id?tVXM!bTf90nLT>6eLal&`-4_EsRp3L>Re&T!P9b+~Gcv(Qf*wM|R-Bo_-f7@5Q z4)@UFm$&-*R*bI2tuOJiF~irt;u2&_KKl09Sh5|?{FD4Az~{ONdCl@E|FNwg(oHM! zHT&*(&4&)0_~B#7=SzHdKR~;$Kkwp2La;aK8T9qfnN14y?5h!I$_&@@IrJ zH7JJkncvf9{%Ei-S(@1Sf)KFNO@%x9#R3G|M3QDr_{Tt=` z8A`I)H6c{6KZE)X?Ho2srn_8+u5psE{lBPWBBSI!QF)h$l4z=1u;~g77q0{%|Qe<$7f0;m9MK9qct?7r;K%zFlR3>s)z$uVql2xXrlCgCWLLw0fi zD#@a9t)dPHJt@2E_cj+JOl`uc7=p&9R&RztGCL0N2WhO$$%O_gPuPK5RuQh=?m zJGi|`@|u|vSoM9?9>~v%bK~Iq?rqZ44^3n&^Z#KSRcs~eWl@~73uPm_q;VW zX6szAv_n>srs-%X$~9ZY$Nc2Rlw1ekBbv>c&FJEtFy}smm%YA3>jQR?Vq)dp=Np)F zEfx$vqO4Tkf5+;{m++G6`^YmA-XroBj+yWdi+iwrMwrFB-#5^Tj~CKK?T$$Fh|JPD z$mq8u`Y6yxA>{zz160_zDOx9}n~nGm9!0l>e}ltC07{E)dn1hzLtZgFBws)R!}Tqa zK9qcihC~`3NQ0r|e42{%m%>U^)Dd6emxTSMEd7A5&wIeFa?%XSmh*SSBRT?7 zgL58_sB8nN*&Y$uoCWSnS738$&S=^K!<;ku2Dtl9xKLM$0I~WTE8456$ai_`5#PbT<%3PJ{4-6EBnzxx#J2~02@xX(rRAv+V0i*4t+8kiZfGc$ z6Y-flJ`4Xi){j4OFda&|Mj^HDwgXyx0a&^HQl$u&cmob4!kDN;gPN>0kP|>1QHeTB z!$Byn(63+ul>92n;Q>7PLn>1a+mGWmd6UrLaHoCDM%1zgn7-HCDFLOQqlnd)Z-s)0 zVzxj~WadZ>8QUD3W`H!~Kuv2+3-!81kT8N?2S-A4h}IfP(g0eQc*56z6VNaI*bQSQ z|MC~Y9j;IOp+52Z(#b_6;NLZ<`cBLPT5$!A4}Jft-K7n&wrddQdfPhbWyfXUf1)_Ul}V4n|t z74J4Nr82XAOeEnK1Be0xfHFY*2(_Vp<^lrE&z4z1%*(t-qq7udEhz%bmO_PD28@{E zAeViTn4i3!wCFqRvwMMT_VMu;t#8HR$0)R9fJ4cDMdSJU zi;;x04kaHJNsP6y6}kduAbp36Q}wZ&N{ne(r_Rm$u5oWpa zRU$crVkr_qt7j;PCtgh6OayXJOD?Kl)jYL3#lW4~@f=;12F;=5qCce0J5vP0hF5t{ z7;}X+!MswcmZUHGBi^5MAAde&hN3cy?vJ5?*_Q*RnBAtc9QePF7KRN(xcZcnJ5}!l z0vH7`5o+Qmlxm+#m+LehN;6;Nhm6$OLi_6}t?3MMLW6OE?w^GMW~@qW8ENaT*HUe5 zM!0d&kqvm2BZG9<5vP+LKuK;5*-5*rNqH{Y~i6(e3_7ahuw(U>0>sB0SeeQGnd*`_=$JSC3`vf6`4vn z;z5YER5u%xD~{0$RI9zoVrE0w${J@$iUn0D9yR6>EYUECVvm@uca?Y8SPqP^$AWRZLHW1XO!%{jmc&;A0bsY3ow5KA)<&zJ6fE z8y-!oxvnnpTup0T;;ovNn#8GA-@xM6fNrk8^ld0pkQ{pnr5t=a*SNs*JnzTXCq^;# zt4;jAW^DGaeSL>qzGc6Pzlz!w+;|gB@@hU>qaWIA6-iA-4KJLRo}-yh&V)u&f2J%j zWq~OR{C{f!7k<;CE_H`NT41sDfnVHMzrm@-{dRoB4=#wA7;v4I-^palLsKFWLtg=) z$Ntmtp)$|FhaTgfh0hFp#AFqJqMBcGhXa8w_myr5NlTEL$Hwgcv2 z?z{tVHsHg6>i`b}ZUTG>@Y8^KF!WP^3jyb05?>401V}%yZv)%`cpYXIj{pV%p8~ue z@HN1LfHTlY&jMZnxCSfp4S-#My?`6BBk~a7KETHSUjuvrZ~>-XrvPsSoQDSfG~jZ; zR{%Ew7GaIC9dIe&UO)qI5b$$=M*)ujl2g0`=muQ#=Ga&@;C8@GfL{XK4)|Tby@1aG z4g!7{`>{s>8v)6o?gMlKz5`ebIPcG>H{i8^I{qJfO`NFfcpWT0UQP_#74>qz_oxy@RN@MmI8hauo3Vn!0!Mq$29X4 zp!#i$>*GzDtGCECcgFOBJ?LPB$J21F2mbjXK&^JV*t%bc+X{R(yny;sM=UH_U0A$k z=2=?{wrf|PcjaZ}3oimz^>iZ(*^%hi0_J@I)>V4s_=FJRq2(Ic!bd!J*m!QY%2|&1 zRPgfH*zMr4WNb0(u@Ur6&_8IWf7HraGH(K-t&K;H~{ zruKY+>1B5LAuInW(0h@8rk%dpqR&IzB>k7!={**GIq3g1fqol7C;dz8`GZ#eTRHy< zJKeD84}rcH`DvUm?UQqjU?4wv4D?~pm)PmuR{j@2e+Be(^&^{q2lamr^fv+XzJy?_ zZ0vvk?CJMT{Hi;y&0-K(71L!t=`8feg)-Ytf3f>FtWo?jjH*hCFe+~2#b~@^t^IJ>CY>Zn6Fb>w+ zd2&_>2C8ob=r4g@Yo|j7c|OrwKrg`fxYSM$Tk;LiF97``c6v^uDgR#3H-Mhb9&Y6M z4(PNCvB937>R^k7BjEW2crw}iB2;%Aonqx4|QRAFcXst5rvu2ki$vlZ|M8^uz@DY2I`g^kTaX>n-^-pLz`RbT)J& z&qmO{3Hla$e)7{!DPtku&JfXqT1KBpApRM?ilH z^egOiTUUD#$?yGB130?%cH>!xJ!9 z{_6$WH*@}XK3Y6+23SB5AKX27v1{&CH=v1*R-8Wr5RKKx`=B`xsb?(xu+Br`DoQNPDGpDLlOckggdLXqsEzoXu(9Rq{rx z%We-5V2bp^BAMF{?g>4UUz4gvmgl_g?FO@DVgT*EGUvP)*Fma7A9ZPg^ z&0V& zZfCfMp_DTDzv{h{N&;RGdZl|&Yg;^Q#NEsAcF@wrp14SPK2cU$Qd&{sxm4m+j?6F@ zy{lL?w2h744qBBKpR1|&(A(JI(|OvhY5DWbew4~TO;i0R6+d0>bJ~%(&xNZh|Fe^F 
z;qA(AQ}JhNs(+;7&(c&sO2xyu-FA$;&&6p9?zH$qZEu?P&DCaV`_tl!%yS#;rR3rb z)+%nL;<39~ZO6#_TfS2A7J{^bKDZ-nd^> zEzoAUiy42817FSfSq^*?<7YEo$-jm1#SZ-4jGyPg|10C&4m>@RCp{NB@PB3eA_u-0 z<0kQ6;lR_coe5v+z+cb!3I{&U_$mkfUdAtX;D5mQY6t#zjIVXz&&rkh_#F6h#@9RW zH!{A_f$w8{iv#~<#&2-o|AX-x9r(X7ev<=#F&dQY*6zTsV|=FrA7gx%1OE`?dmMQB z-6P329C&)(O!!_0{#@}KVwQHR1HXdt+Z}j%mO*lMIPm?9-{ru6i}AZ1_+K!7j{{E= zK$3s21AhtQA8_F78UK(2zlHI89r!OW{$U4xi1GUvujX^F15a`3%ZoH|iRV_hn~#Yi z;njRw&5O~E(#9nZ_+1!BX0MZS?0?h6v8%;Ibo_WU>mn+5{CL_VKiJRoMoKhZQ#?LLvaoaNc^8$A{jfF-(dVz6%w(W@y`K2 zi|_4VJto{F=lshgRy+sC?OlxDacL_4CB{D|zLcbCO3(T;rJV0EUdh=Fe35pZR_qv0 zKg0Z^ET6M!e`b6O*K0K^yB7l8kmDSeuRlwcyUrsSFJ%6wfIo+7TrCmkp5l6*`P&`! z`Y@V2oqgJYC;MzKm-!XXFEjA}MeuWBHt;^IcT`>*TBGkeVpP-4SSYddoiV!n8Th{n zJn1vLO7cA_fmU89@gtW>#O;h906ty0G><$d=RD0hKYxPdJSx6SV7+7g?3r9H`_%_o zew6V}`~M#JbmdlH3PSqlh%e3Hd#x(GK>lnX>?WyobQ1Mwg+8;dzZZCt?~Es>82>@;cOT(;wP8bn_&?0;axLTk zjq&$fAra#H7q~6KMuMt4vkI^hc#`ul2Oh=qFyajLNB08Bt=jP`%>Q%t6D5D%2hz*; zXTTd7@b_fEe+78b^Yh%kC0vODghImK!2W>lCN7=v&iM8?@T!i?D!_}tQyg%{=T}+I zi89H}-BnwI4Jj&jGy4O5r;x5+Gu|1`S7L*T_;c9NR6l);@lUY+O1F#WOa5YR$7NEy z)(^bmVUz&x0H5w0r8x*JRBk)VQFgnT@hvQv!>;xN#$U(vQubU5e;_%%%n!F0*Dl5% za`gAl15e|fbDZixoG-$9Z7|9L7<^s$TPluHZ?Sx5{8{Ce`aJ9yzwQP;oqfK}{OhlT zP+W><76wYH*Hi3QOC(vlA9&K!=?`CGe&_i9JK&W}79l_Z=26s-hPm7z6N`bTa%Zsq z=;q>bGk%2qB*ggJ7=I7PfenoRGvjx&V=5iI=(i;QLG~*S6WSi&6%UIL;E@dYpR$}y zWs+O@?FDcclE0D-reyA7{4o1lBNsBl_|LPRTNuCMVkzgFY#-%UcQd|~`PKM1%J@fE zpIVl4Cjvg{Q^4gazx@^PZsxIrzcRnU@)6d=wSmSpJ0sus08jFr{`MomU!+y@zI8tH zKh6Bk_`G0|l(U1|7v>e$F2>JdJE*w&8^$~1TNV6<^x4318^ww1Q;dI={Y14p#iMlR zvK4@V zbh2CwXFRn>^p37bo4-qM#}A!ixR7m$nqyBAr zFlv=z=3o0ORSPWPY;grYBuUU%X&h=KyYg) z7SscgFb*?|2XMlcAeD=sUiqORAEwc4pl^dLZZ<(xyKP+4^t#U@YLDh>Z(v!|8g_;O5Ns<3ih_ zU_32Xk3Sj=pO%YcO;CXfSpgg_c^XDmE$R<vcGp{T2GEA@IYDvKtFY@1w}UhfuBkf_A} z#KEX+I(5(`CPEn9C)r4~W#Y733iM_Ab>c_m;>6fVHMuf9AEBFFp*WghF=qV2ex5~42Y5F)RdwZ+Q?$zD6mQLoiO)=?E zU(7?hF@p}_&}LD{dmM{vU#u6i5;?R#uT^O)Oo zM;52_3sPjQ&D`&;H7e+oW$H=JLzAnkZKu#msyfKl8388|?_?uX=qWX)Lv1OZPVW5b z6j8D@=@9RD*c`cnI8+!jB^f+D<%%Yeh$H3Qku4bc{vn_9B5Cc$;v`H( z$BK(_VOdW+)`=L00UQooNs=cxr`%x+#drG}Hq_1`UxEuv;NKg~v1 zNEvt=(QC!g)wV+3+s=Tn%Aw&OI0`+*@Y1DeJjsf4K|vTrIyX#_?il%u^BRgk@2PGc zrJJq2vf&iV%)JYW<7@Ci6ekC)?OBhxl2&_t36>2d*rGVyq3eIw!Q?dZIqbDWbRAEr zzDn(;Ose;+5#8BV1F^}GjS#_;qGqGKI~1<#hBsq^S4RIGKJ#~3B9rH=Z8itkecWx@6M}%yc9L@=9%=0piub?^5ht8}NaJ>PEK=3&@6}_pdqqYQ;B6P=5pp@?7dhah?%#TEMc{WHMg@a%|=(OE@KdGA`^=}Lp_4kT+rWL zhh=Q3?uE&k)@JTS%@sO=23gSSh)Hr0mCZ#^y7h{=*vd*fdC*R3!R)2AHI}-x+LaCp zD_Cm~nx0O|MI7Y@wFAc5A{5OPJdMA zU6_CuTPWkaJ^+Ow1+hNGpP_kpYqt}%t~t^b^ER$-YK}DPsT}|hNrI7{V7ic!&H7vG zRcc}9tTY?r3#N8r9h_C?xB3yS6&B1=)L%O8Rf2U^hqO-v(k&h+_$)#0&(;)tNss%a4gjuu+wR78OhyPnixgj-B24EH`Hb}@5Cc5HYWm+xEl23My3eq z)_Qt!TUFTjrOjB(0%)d$2?1Nt*`S#fpPYg3?huPgHWobmjOr0IGdpX^ZX3mCjJecCr_aZV(3@7$4 zy)72AwfiKOj?$v!H&vlXOgv7gsKaxwaa#cxDZ4^#0X-0li>Fqw0;VmEJb-7#8Z<{X zmw{rni9#@RR5;gez>IyRK)aHExqLWKjlGY)RM=?g|A{^!W4{REkNICYw@>< zX9g(`rLwW&u`G?a*&5y&LN9I##^T*UtVvd_(#uOqvp2fP<&_n+>C1LfdKP=D8sD?r zI7`)2emIZZ`4%BGc+2FR(c1<`8aElE^|*Xmm1XVKpo%iH5o;^VeNhDOY%cLFE7O-S z)J~4Dba}wsvoxptSsFS*y|fmVektay;%hV3qTKwB30lPr&XKl{;R_Ggu!RzT?_(g^ zbx{GD9zirBN;iqGndlwe_#%%~LJrcNs`1;z+1wUyHic+`Hw367iZo}UOXhbFKRCly+i}q+GSk(nf@J-*6o+vie zqQ*8Gsx2PsYF`{`XH?C~y2XaSLlf+s{#d70(!MQ>A|y1TlCmWjjp6%YDG41k__Cie z@Lf-rp_R~sdaVSvB^?pIi3I~%3HI5M6cM{58WCT5DG7G+7ot1cAx))FOdz21UJB}n zflEl0-<6g!)IAFKcZVQe3d90JE5Y})yYb!Yly7`bDuJK(XfAwmIJbK5hJxxnG)g$} z+AsRP|8#s*{;FI_RnRSy=3n*uGlig^TiEk&;QR`z@9!)AJaxmc&XHfe??XYigWs9| zdeCuXLqzGX-mjw||17|Kzm8LXKk}c2kIFy7`YX7J(^N;YoyxC%pAiC%SX6%XJ|6|u 
z`+LA5E~ou(0hZp4qw=fwktpcneA5^zJ_SDkJiTj2rPX^&6x_@CNPp5(>Cb3w05@b) zm0!KzM8WN>uqt2WSM|T2^RHxq>OCk5s`n33{tw{e%>M=8s0bxrb0a~QTl~c@o>DJo z`#%J-J-^#6`4v?9D=9^Mqu^JNw&z#xUr|u;Wy-(bkzc*1MM3)+8ze)?Q}9toe)aoC z1us^Hrp&lhKAFO~GPtqpuipEj;Mt0iq0@d(aDLVPS~W8=%p>8pOSPXu{S-IEqVlWv z$SAm55jcah{O29{)%#`?+;t{rWZ*3SWk-JXUK$0RA=X*`UyvU^trmTQics$b7~V*! z@%C2bE0~MQ({H;tOqlN*ILi4IpAd#WDy`s5?{b$x`K4_i(;V5^i_2f3-xL ze;081r6xw+6mOCcKc`B$NLRY-&{2e-`!)%nZQ>vB(eL$ie9U@FvUsnV(w$KP?2F6P g57ZC4n5fECy!iRLS$Bma82F-0(U~>&%aPRn8}@h(vH$=8 diff --git a/test_scripts/nccl/static_nccl.sh b/test_scripts/nccl/static_nccl.sh deleted file mode 100755 index 9cf7ad2..0000000 --- a/test_scripts/nccl/static_nccl.sh +++ /dev/null @@ -1,29 +0,0 @@ -g++ static_nccl.cpp -o static_nccl.out -g++ static_time.cpp -o static_time.out - -export DATE=221221 -export NCCL_ORDER=1 - -for cards in 2 4 8 -do - export RES_DIR="test_result_${DATE}_${NCCL_ORDER}_"$cards"cards" - export OUTPUT_BW_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards.txt" - export OUTPUT_TIME_PATH="./$RES_DIR/result_statics_nccl_"$cards"cards_time.txt" - echo $(date +%F%n%T)>>$OUTPUT_BW_PATH - echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 32 - do - for w in 2 - do - for m in 1 - do - for iter in 1 2 3 - do - export INPUT_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" - ./static_nccl.out $INPUT_PATH $OUTPUT_BW_PATH $cards - ./static_time.out $INPUT_PATH $OUTPUT_TIME_PATH $cards - done - done - done - done -done \ No newline at end of file diff --git a/test_scripts/nccl/static_time.cpp b/test_scripts/nccl/static_time.cpp index 444446b..4a29f77 100644 --- a/test_scripts/nccl/static_time.cpp +++ b/test_scripts/nccl/static_time.cpp @@ -5,7 +5,7 @@ int main(int argc,char* argv[]){ freopen(argv[1],"r",stdin); freopen(argv[2],"a",stdout); - cout << argv[1]<<" nccl : "<SKDjvwLY{|gP`HQ*V@?URqBtHw*HuiwlUQfTQv8(_Fns( zGbb~_rm_FL9GI-L*IIk+wbx#I?H_06?AyIH%Q9RpOQ0nd_XO+lq0LCTlzI+8Cb3Tm9D zeAz^D(;r-J@z;b~&y;Be?fFD`70mB|x8+K_g7$o5o53}bKG>cNeO&Khp%iGWCz*@# zB{*M#^C_6m_Eb>WlXRp{Im>lIH)wRZ#a~Tx=3CAAoUn@d6{IrgYIxmfnMiqcobSB~ zAx3k;ZCpPERe7I9KC;Wlz1W;uJ(riR9g4W#3X-4Dbwzuqxvb=h_Lj@rL*cIO%e%|V zE-x!7j718I1)cPwGN}2MuV~Vsc0b`*oM?+#oF@AF@1B!)<2Q1Dbl~Mxn;L!heQ@(j z4|^WHT2+>!?Wn>6xYOr!rbY4|TrgMTg!{@yh8CBJjyPpRcJ04Yr12Mljk~ zyR!1vb=L(n9n|(%b)doUcmi$y zsBT34Ap;0+gCBpq^&l*0_Qyg2JrXmbL4QYequyAgdm9$&_1CRvFc#{1pu5}S@f1rg zFonXc)eR+jL#f{A(Y;=jk)AJY!Am{f^+LOnV7R3{m4f5wRt95Du+Wvz+>yE5UNbhr ziS=F(2!*Af@=zqELq@QqIvg^pWW`oAMA~ECx@GkZkp`V?1Vh7?vXz?sExNzGJraNk zU|zr8j5ZJJ0lyJw18YlrowUuu)}Vo*B$&!PW7-c+<7mJ1&63o7dTV#L-WiO>B4K}f z$k?c_FLGEYx$VcZP5pJ!zQXIqI$f$Z4K%{rN_FY2;SC+Bj4HiFm?|0+Hk_D-X5(0X z{WrBf=Wj4d0+B9*8xIa_HeG`e{dm1?qOFAx(i#ZpF^V7_O>YWs2%-4;V63YnNbx10 zLJOiw;1)g5c8k8wA8OYScgSCb3H?T}J7f@wDzPpajC2MSZC!|*;0uU2I!WW;)Zs_w zfB0cHS8A9=YjBR2dT8$Myft2Gvphi&+?ZT!b<_;YOdeAb`ZqQHh% z<1y(-a|U&(@s;rQ@mY1*R-8 zWr6>{E%3*@bN=G%c{|6~oBhj`*sj>xZ)A)P`g#uL91w;XEq!T)rj3kV_%cA=xf=1e z5oKiPH>0DY+eMl-U`B?XGt;#3GBUK^Ow)$T$j}eXG;Oqu41LE;)8@{|(A{R5He5!A z?l9A|(K0f0tC^+^mXRU9nWl}Ek)h9;Y1&X38LBkXw2?A04R`;n?IZIGb;W|}reP=7N`8zQK`nWl{p)Za|g1_yS}kv`jzp6y7V>PTlh(jV3& z>-)YV{f;C3x+DFnBmG~F^v@jWryc2^IMP3Iq`&V-|Fb>qd8a!5+|9oDZ+$(l4c9f+ z^ktV}>&e%5>XT?FU*F>WXs*%O_aMuA+bM|~Utjj)1Qs1LW}|C`=^kC{q|qaJ=WYhO zc7P%H?-u-}cM-VcBVT;jxBvIo`1T*m^tqn#J@=6@3j(|>AZPT*I??VbKlQ)Oi{F8J zXkC{z`Fa+QT!&j<{B>i7uXpi9$d-8Q!_m=13!M3x?5Dx!x(RvB@+tq&1`z3{3Hcg) zcf9RG2af+_AM*L)KYAFT#n+p4;XEPOoAC7edS}lfg?hIV(bvexy4#Rn40 z;kn>B0#z)Ye95zzdFBe9BmwbfV2z{5lz4%}_LY8$c?X0hPr(jnlSuHzUrhXhkfsI& zke>IbmiI(|PogNk^%WsttD6deLT||AT25XplSc@E`U}J-U*A@ZsRv}Tf|LI$nPrkR 
z43SZd_;0&<+ZtS9>$Tio3Z#k6o#mWBvtJ<`Gy|BPxE~R%X-T4C0 zh*cR#6haT*q62yN^l#}m&{UFR(Bu%x(zjr!Mf(=N3)zW%P)Qb5Y!&sU(37&ej@}F1 zW=Y-pDZ_(i2A7rLQ9|}+|G}qZe~nmoo2)M|0dCYwNYyL@iQ|6}NsSx-KJ)}r-+{yq znOw`%KS*ZTKm&<>B=?d#_456d7~b>an~ul3j`eIh?&>POW79G6+OA7r z^GB(96GE5SA1tT3mfi;xZ1*yyAb1Xl+yB!P_5@{pk_P<5E;x#42D7P(UK}eI%S1q0qNbHWW%ruRj6(Ak?;1 zNQjntqX<$5T8&l5ZcMv2dteHs**@Hlr`faP(R=GmQ(W?tFFuF?AX{_{qx6+}Zo>i*wm z`&@$#e?w6=jH;bbU?6cPLQu-m2jEKFyaN?9>6A@9P$8a>IeZ#PyHC1J$BH*4d_9|n zHRIFLYDo6hDvQlYQ+ATJ$+Aq-iTF7~3b6U7gWH=RugMc(7M_5TExU7Or``x&)_8x6 z#(N0qLsRsjbHqilK9VQ;O6@T#)MY2u{AqO5iWby&9kLQMxkW=!u30ibW+yI6%C#3h zqS>t305NSa=N`nD-M)C!PP<4k9r5n*_02vB)9NQEE7kY^&>$Z7lInZNGveMu@)nL6 z_YR7Cu>F%Ti}zt)UlTrFNEfv`B+-XumZpA2zc0~8fIb2#djWS+VgF9yI!@i}uy6kn zbX)j0I9vpvwCJ`s(wH&e72`wV6(lew-YV$>iH~UPr160?7)YFr1(C0}2v(w^4*BB0 zB3e-W-o0*>^I&q$-w}`K2vDwTkaHTG^Jqk6>r2k|kjUmNa8IfN85vyEd~-Zg#hwIY6VnIm?xL8qOF?HUiJCD%bE`P_WvatY>MUEGz*a| zuzo(@?)Ak*jOdq^r$&I~381vP{O!1*u~<%D^R|2q{xQ^xKXNo3Nc`UsNbT9QSL>P! zR&Kv!DZ(ZG2!|43OjM#)&4_Br380RsM75=1KNMH!H)w2*{ZY=b(aec2UHxV z;V^m~90|=KT5BLd188;pXzAVH;kG1)1L`=Y(HIPb>FHI+BnO10FrW(M(mdt4=*d|*2So%*5E-m#GYq=CfqqC*}~ z8vpYLuBj&jr=(aRp$~)o&0)~Fb{mNoU6GTY7fJXBGhyZE7iV{D%S}-lxDuj4;jg` zg+kaQN#7J2j0tq#77EzsEVf!oTbI3)Y-=;ZjfswIz-&hb>9B{LPP!MNkFHGg`h@fu zEWN05aTMH#UeHJVO7w&o6x3-bPOkD*D9OHhNu^q{6V&Yl22A4t&57V#QYI6_J4ueOfyFCvMKfr(Ole+@Ri&?z1uWY0l2 zqIT+-&0&B7w$02%b`_rSj=E$oC%+<72}j%y(U$6FSL2Ffv;x&?Z?c%#5Vo?$SdwBv zIf_S(c?65s_M_NCX6s$$9W<^6M%d(8p4o6{d{QK73fC{gf-nB_#Go*O_plqCGT}b@ zxTr+W!E#%q7+b0;u@7;^8bXdh6dH1oL=7a|!i{L)fmsF;6{7fG(d`_fh>Qb?7o-9x z`(ffddKHl_W!*!e{Y-nM#&7LJim}7^U4VljWbn>o&*`#7UiEbjARw#J@axg=*H9xl_^&N8w znNK;!wl#Pxq;r51p8XwCQ-%Czh@s*fnw|+=d^=pkySzz*@5*JzS{t- zE&4l^5IO&csf5s}os!rW+tKeV*>qvk+ezL$dl!Nsgts0-7?Kr(Os-fb!YNnm0;(m`+5$ze2ZS~dJFZ+x$!2N-c{aQsUKLkt}yT1 zEmWK+kD6S#Bt3UBpEL>b)SoE}Oj%&c0#g?F2U);{-?XSp-C?>Ru-L@GFK(>g;8fs# zGd|)67sN~qxK7CLWU}R<$sLKIuK>_v|LOQpnJ3{xkMU2zX9hlE+KoR^&9Aw`fk3|JUtYS1)~OJx#nbEbtm+%!~OhU zkB;uc{X<7aM_&Wn2Y4KC{%=P|?+2XyyV242pyZoxjE;6fk&-ttVF0WJoCSOUumo^D zU?boIfCk_*fI9%+0DJ^68{_d=sAvh`)qraO*8pw?ybbUdfV%1=a*8raa z%z|;=0K5S37+@7(0j6~=fY$D(pz-!1?kphb{vAJYXH*y?~v7j{|N8{5jymfS<-QwL!o}z+u3<0rTM} z_X8FI4g=N!&cGJYBY?L79tTvvg>ikMUUPNlyXMT8p0gbtitu>)t!;*XUID08O&1Rx zF2HRGK5eg{{?zeu^OxloT$^{whMdjXHD_LV>4FQ+2Ug@?1K9s6^dJOZB-g-aFYuj| zpD!wF8$O=bMn`FfUi0SWe<@?>sncqq5RycH2%n2V7r*OK^lL5pPeDHubfK@Jue0cH zgMKOK;`d;RzRIG{fIj2V=bwNM@tW%kbv440)4y4InWoKA_9VN}=M31Bp7&?v!AX=3 zzMjm?{h2}$s^<=r^BeU0boF=y^kL8;-;&X6)#G{4kAt47o^E6zI}-h~fLY&xb(J0& zJ|Tp7XrW2A@Da})HlEw9a;`>vS^=Kh!DGo-Z`NZC=sy7cVmtk2EB|evKMFe8#ms-B zMc)PbmnM``MMc)VdL!hUsC)x28reg_V?T6w}Zago}cPqi-m{5a}Ye~Y<>*%XF*TbHw$3q z{{sDLdl@mS4&|VafPRskev?IC4LZ#g()oTj=+iJaNawpdK|c%hbZxK~bRX!Y_VU}U z@(+W4Gw5^e^cyVtG0^>>r?N4%VFBh24WOrsE#;usgC4TW&sZx8p!_sP*)@)Qn!9`j z^kw$^o1}hj;C6!k2U1bR9f(fsHX z%)ip*r+L%KpcmM6SZT?p`BVnzsch&*o;9Gqhw|6k^OK*pNf{RaPj*}e8IR#Us{w&X z)r0sRw9488SucX`M!T$xdci=pI0X7k>_E@A(`|7f0s1+hQ<-KNwy`D$!}0l`&$iQR ztn%l9{%O#Cb~<$Y0$8YgAL!SBo-UTRfxgB`NB#)szZ3M0pr^7W$=?HdJLpOM0r|hk z`454<6ZCZYCqR$9G&+j!#ai{xs1g-)1C@h0=PuC2Z}(Lj$dE)iG>Uce!BdAaf`nL> zoB!2}>fC~_WO{SmeOcbzdE2t9a*Mt+&6iu=Gktk(ML4&-GPkHQcixg*_mbR#CAs-a za&yG{5)8=ch8%<-^TJo_PQUK+cy{N?{_b?`2N}<1zB)~NVmeqr5g*)r^Fr60t8UP= zpSng|uV%T9X3oP8wX@HAE?fIuHaIcu5f^ggEWSoFUQx=%Az?221EHGt(DGN+lV9Ekh7MQZYlm(_N@V8n(Y%t)R$XJ@vrQWlr z)}~HKd#7|MJiP;ut{D<&np@tS%IWSa<&9XU<5s-~a1PVevlje9OxJcI+pC>n#qiK#3U}UH-tN}@4z?$oJ~6xmPqXE zh-)6-&~qTVxHwI{OKop%GQONEuJjhqoWO$Z3~{OQ#U2(giVt6T7xjZ(1qxqL7U`XA zSUk!VX{9F~AByy~OvmFfk>=*$(k2h6_o+K27P%#+f?*xQwG2BMZf3Zh;ckWxGwf$L 
z$nXfmVTS5uhWY1d;&L-AVpzejj^SE{oeVcK+|E!+nfPDz-bp0^F9^NTJ+G;`D{OSR z7vb%oMVEWJM9TBU;-bQ$l0wfV60dTkhneVI1*)NKZ1i@}@|5^YO}&TS#txs((r!!1 zpKbP|Wd3QI>OaZ&>0;m0j-&fbxT^9$J1rC5uKYF`f3l|fM>75tP4%N>Je=EY$H@Ck zoTlJTiOm@7d?Y=L7%-*iJ|i7qUR2-6n5wU zp5&3U(fhij92n+Wqg4H ze>daLa^U}i@oop6p3#$@7dY@AFn*o`Ux0Cw_~$$DS2DiHfxn*dB@TQS@oOCTzc7BS z1AieJlKd=2AwIq>TlzuSTT4&(24 z;0GALhw*AY_YUwBmtMYD6PI|7g}bvcfh4?|Z>xDRx>3rwEf-a>tIR^+NvGapXoJ=kzh-PCMh(JYMN@yO1+>-1!>t=cB&gzCv;qbA}Ir zPjwE(X)qx14~mDtn%2tv2IFsCAQ4wH{srJ?^1U6b#DtsVG+rXH;<-3(zsmT>=OyD` zWBd&9B_(|4f`!$bEakj?X)^yd;PbUJwF1X@`cKS1kIQ9o+MgJI7uRbU%ij&f+>qlO zm#@cyh4kN2BpJ_R{+|MW8r8T$BG5g>^)mB+!%?r#pvhC&rv-SD|0q{f@%%Ck|DOdv z7iI&W!g@#LwV^fo?juGu?W7ANmcBzqmp={vzW`7ATr0kWqG^vwc5Okf#OJd8Z)bcT z@TtnBdE{vsXKK#*`O_@t4<%B7@`qFNxLo$Di&=h@@lN}{34E$@OE3i?{maXwfOA;R zmw->jKbVI9{L`eIor|P^8kVyI`14RN_5OIcy|^9*K9zhzl0FB@C7+M^S26w>_D_XB zgo@A17(4I1pZPz}e#PBbdmMQDA9J0IFFR7X53t`3Ge;Zao&9kS<8zt6i}}yOf}7<0 zfZOFF#zz_dDz~H3VL#)yFrK@aRykY78TB50Ww%D)N&c$&QZP50_Cv-y z2EPM%lJATs#~FW9u`Ks}Pp zxW6c#`{D4^AKh~$w`#}lG5;y-Z%Y1IXQY<{!EL%M#=cxOCc zf(%=m3wFJ;e#@CTCf1LlX@i|bCtf8glv-vXY-JLfpnia4K-^;&K=ADe6#{N^s zg&b!5^Q`B3#xK55%6X03Mfug;jPGTBH9n3o{U8TNDKx32>4W*$5E zfcgKG|&vVhn2AH{aXA7n%QNE|=X)dy(;X z0@9^)z6J~=-{}wc0p>I?+~!408UP!B}HILxdofD^U^X@U4L zmLD2IPOC(gg<8XrD9))Oj&fZOMW`GkS1iK0df_fQwyvhw1MP$B{9Wyaj-PYgqIX8x zLxGL*JhmE75op1vpm0eZR7E!;c2tqSCDJLglXM(vR*Hl8g56R}f7poCl!gL0{*9!j zB$tIFv99J=T6#I;hdOZRR$7jQfk=Bhj_pdz;3@Jht*vYbL_?iMTBcHQ&|O+`d9d5S zS!ThOv>Xfk8*WKUENP7fJJV82gX5At#X7ak#&Dz~z;9NF-($_)K?LYB5J6pa~rB-}m@?*ehm zB?O+JZ+Mu`*w{Hv4=R>i!C1gQ9vc_hjMMd^!FB2F$AvaW!FWQhPJc8QJ|P#$8m9s! zvI00<@&t^mTGStI9Y-8<#X83=wA3sVy+7zrAACGz+zMiJ!TE#ZG^{F`jxc1aAi}o< z4R|aD3cUei%6c4A*h0q>8bLiILRJZaJr6B36d;a-zJP%SC#?E$L}Ckm@=HSsrHeu# zOAw{H(1~C|t;r!Kg6DbshKwT8$_2R79M!f+|Qm)tI zI9MJ6i}flpcVr_?D5`6ki@aWp%AyG(8z)w#*SlU6Br5TbI2e^prw+QrL0FoO=^&}LD{#QKy` zCf`3$&AqD(y~~Rl;Yi(u=vUP%5!`TwtKNly_7k?FdCYC9Ba4&z1u1f!&D@`?HA?7| zW$H=JLz62iY^TslsyfKl8388{?_?vC=t(uFLv1OZPVD^ZBvG<8=@9R(usL!Cai}n6 zN-}tQ7ATrTB91KRh^)uR_jmc67lB)a@TGdg5)R+W5hq|OI#yha3yV6tVr__V7{KAc zr6hTrbIKj&&|6{=y$z!_oqUdb)j8D6?HjVF`hTu=~3kq)4}94@;U5PCc2KNNMEXUQzq1Vt`*(cRs*rg zF&QC(Cq)g$f{swQx&z*f30^V%clgZTWQla1b6tZu!2UgP4HJ%9oT?tgUQ#=jY?u%P zgtil$8}UeMpHO@PUWhp1^g|lAC&wb?9sX`TM!Q!uHA3*fGR)u75)H;;k#!StIWM{k zEo6kaVysuj4vSd1Ojek}?24=-7c!N!mZ`@ehP8~B)=;4^cPn!V1@D;*@As5q8H>4` zxfuH-RtRF|EhbA??P1OBtV@H@9;;3pgzL$~V$V>IU^N%?cT{5;Tcmqovij@N_o9Xp z9YKRE=yk*-xrmy~MNq2sin-XDly>r4H&FcE}&7IbNyxx^$p0a3A%vCY& z1+mQ)2*QiJC1NvfB@8LTC%7hWN5oNIF@TM&t?+%Q=Lxj=qdM=x1iaWn8RPWnRf0ueW9?G4RVN<2Oy1TC#;{toYY_Ab3r=pkC%!!itWH@kK7z8Vq_Cu)!Z~sUB~z887-^8j(5xA?$Dj;^<>m{qGLp zSgO}zr_*DyM422Cmlf=HASh3+bZgq1_ zv&Spyf~oNs@0?I&TGXr<4=%eMYaFq;N&$|Z6-;mqpFCch3;4;foLZVpPT-SUvtlY5 z4T{!C-m^phFuVJNERBemuCrryFQ*&Q%#&dp52m>Ey@)gu!}0w~Z;r)m?LNV!qqHdb zO?fC16OR*0s`1=w%vJzK%JxumKo7*a#8WF+0n?T`9>6EX8Z<{Xmw{rni9#@RlsMOJ zW5$qA;t3S2k1-3Tz;`=DJfBdM!y`LHI8l$kGhTAA{OGdNOUrT`2yHZ$b;H&E!FvSzwVjXLq$xF z)zV9@|C1hMPTDA7KDQ9l)Ja_FSbHz3)7hqw9ia&t#dI?^kJvl%u4vFxJ?xS@0-n;A z&^lh;>0)UIg}i!$PfSxMd5=|&Ox2Av&%#%+FkuQtu@<23v^Dyh#WRDXhf@I*U8*lAESkL0MJ}(bs7+tClhP-#x2o}dk{f5Cddd&ykvrca zga&V&m@|5t;Yed9W3(QZPpc+bdo`${%*lwgCFZ^;g7;)D@hvOUmoU^$jSwT3Gs}n74|r%~*?a^E)PJ6*D+Tnm>;(JYd5XO8l*lfoRu71!#H%QHLm9 zFTQ4?w|3x*JW>fcNPEi1ZWB-DCb4~o7>1_JvIdk)VJJ-yYB0jVPsZ!vO#hH!dy;#X zwY`}8#4e0R#Ft(QgKhkU=(ZL}Qz;Y^2n-NrKJpY zkHY;OA&8d(v4GGD@jdMheD^x(8{ZR3(0gcH_+)Ty_1+By)q7}^aN@OJ^nL&7_^ABl znUboYTPDrFIp&?bp|8=~^RMRo3aan#t87{7hGDfMzk1(?f^G-DGynCVo9m#eozxsVf2smO<`PKV;6jbl;0gJet_P-Ta zdN+>Buii(ZppWxSW2pEP{37u5t{s(D?=4YqH|Hb$Nl&FeqqRQVkWE#7^?nltH?zX3 
ze3f6-|LdH82@6#3K~Ye>e~9v*fsZr)cYvcJlzh#N1YK_N7r%H)zMSp93uJqKw_EZn zsPtD-@XIH0Dfm64?D^IER}@ryoDIL=ke7lFJMyddv?yplbAxiKdIK81S{H+KEidtVg9&ytfar~RJh{Hp!63T9_G!bt^G`>FKLa6{#*{O?srih|n| zL88s~hHz4Q88`O)>U}c`-g&a(l+aoJ>yG^Dy)+6sL#(s@e@1@%v|98HDnY#$U~mnk ztoD*A%YT&~nNXg7+r?qReBZzk&ae2=<g$%4hpg|D{XGQ-1IuaAZG~ zU%mh9*v-TZej?f~m0!UE(Czun-$$;MnN&YlB2+$=J_nfzr^>JBb6IHp2_cYw3${+Zy=Toqd?Wv%$C+SF^N|x({ZqVp*i@%29%(ss7IbjX+D@bL~)$~%lZ6f70aK5~1 zDaQ$K;hx?Fy_Jg= zEGjRFMM}y92kAv+Q1h)?(_%pFLBg>((H65fP4rJbGHaXn(~Iudd(A*i@y)%Po|^Tg zl}ZMcNj!8>el~%XJ|B0)BkTYDtXo{O(DMxALVV`oQ#|*!^M->@cE#?Rp4HysFPyz+ z#tNX*UJD?6W*W+0Q9wHUtPK3$&VWBFgB+?|I{9@O`0HVabo_}7{GZ9d|G!}5baG-D z^tm`gy^dwTUyz~Pmow;jR|b7HXTbk1gZy7*(ElSD_%F$Te<}n1t_<}hzjNc`yc#p; z|J@9D>N{?y5Tuu7!2cqHJ#UBnvy3@LRkswT;@Z8yUzkSzP&;F_fu^{}6X@_q z&3M!wiUZ+o^5c)U5rl=U{#YnrMq=@3(BDTLfcGu+X*8+>yCbuNfQR z)Os%rgu+r#WhfFeAtP8`8xF;5WX0ArMLJ{NhLw#?ktUOD1Vh7?vXxr>ZKl7oGZKIa zU|zr3iZ&0M0e?Ku0oJz825Fnc?ZG&Pl3+UXjA=hOjidcCH%m(MneDy3W_K_ei-i51 zq4+j)OR2*`sck==Z5pqZ_7z??*6GrM0u z>%Xb>Ie$~UJP_%LbK}8*t)^>=N8ej-n`mnxgtP|&W{e_;$1q#MTSF+mF&OLV3Q~Lt zsL+C_64-18IyRe|{Gm<*afkd>n9v^&_J-nwqDpLv1|!`;McWi2C%6$2$0TVSoVxtT zylTY?b74uTv8t|i`HFSxa9>_hw#dM3_3~QNQ?gK}==UXM9+@=DO3JOfvXTnK&}__l za>d^q{LRLEB^z@Y!peVH#&m(t6?`;v%Hz3C4qz6r^pTefGoBhG)Hu;_@wPLF{}E)kc)qUY^J=^?=2I@iIKuOFg@3(R;*UD;gYzUlkH;0ouhNGj zl1_6zx4ad9hc!H_BJf8v{5cx`Q4P;a96?QLc(ukN!U+vOSCey6!+%J_7qb4;7DXCf zjmM-T%^B3C##h4YH1GTUW7$ zXIBukQVp-KJ1R9im94HS4X@ggXg&=O2ePgP4S%|YHH>u{9u99^8#Vk43u_o18XgX7 zUELasAeamO=vV*6{lLdXI+xfX2U9!_&G*UHdfrY=s1T zK*OJ{;rDC!4{G>94S%kNpSqp0z?221EHGt(DGN+lV9Ekh7MQZYlm(_NFlB)$3rtzS zFkUbC&|AK~SMq%Qx&OHq+ZDS8<5{D_zP>|w2Zeb?D}KGkFh)i%dKRGIe8JyAl#ye< z8yy|pP5C=$17_sdQ&yTbUPg``u+p^QGIH$uR+=_iMvi^OO4H`f$g$5^Y1(iZId+Sc zrj3@7V>ei7+F%(u=C{(cu`+V(qgI+WR7Q?fTWQ)z89C;$(zJmxa_j;tO&cd8$Ii0S zv|%!G%w?quMEbRNRefoL1ogMlv@wGETWQ)5LH(^XZG@oyR+=_IP=6~;8y~2@m8K03 z)L*4x{kg*WHL|^sZmFl%u+#O9wAYco%8{;cq!&2S7dg^%9qHMQ^qG!yt|R?UU8=rs zI?}H?(l0sEzj360=}7;~k$%jPe%O)zS4aB0j`Y{{wCB~@#8cP%5-<4rUOdszTsM$A z4O^YQfirK%4wP@;ssm`Q(b;z(%Nx5Xi5uTQ?vDs8IT@dgt`(+xbgk1yj~1N21MJ2@ zhT#94;IH^BflJ@@B~JJb{OKdU11Gb6uAlgxdN)1`0=z6BZ}jLU(e5fg^}iigy$bg* zdM`T868cw!I^)O`xy9K8<5v3pYk8u3L@RKAYYU3 zmREe}z=?++LOx&OUmpNy^Y!OkR4fF0lb%6e|Lj?$Q2#C>`uYYDK2YTf6FbSi#87e- zJQqAip^D8@D0!}8o_T^NML^;wu*Pv@NwJg`ev<{7FFeJKSc+4=sR^8ErO z+3bc8D%hVweTQ}pTP4$7wnNt*lCa~iR5FoKa-XQYOGHUDRTP`9&~U+O@D))>32Lrz zt<_xfgu$NXQu=q%oiBh2u;xR_$I0$X4i?-oxN|U$mX#dC7Kc#w31SkC@+f2{2cVKH zD%&n z1t-i@3KKWIo#;8)chlRhp0ZnRI!PU{=TbE9acZ`?q8hVHR#9Cmj*w6E=N3{5f*VlA z?#C@*Kc=jY(A*_?4+5Z8O5&!IzC_OnU%&UnSnTexC}Q;WBfxo2QjlBnXlxpxj^_|Y zV~6%Wl1+Tj=Ux;IeZ(A!LNC=$r#GI0euyS0c6~%vsWyrrZJ^Y9*Jd9qp)`94_v2~y zXPN0}XDec0=Cr3+3HV@(!WV0bf7z*mar- zhfS%6YVUqkE(UpJl?__k+dhX1q9GvlzzexP*RbY4EK9?v+ObF7ipY|-^dYzsH}6nY zo^~&u^*LM`N!=&imSZJalD@v1P8jhIN~ke*j zlDwusL{E4EO1ACJnVotgcG)wRlQeUIkO4Hs06IrP6zd~-qOa7Sv?E<^@{Tu0N43rc zOFLvGX_}6PqFl3Ne9TR*Psz0(K4NIrY(f|BfI0UeyzKQQTK4E7#l*_H&o?l;84HFV zQ&y_)PqBLPCA_5iKJtu&_prQ$VU^)L~!ZKM8xkEPcPP&%58Pa?%V+%lUia5gh@_^$c^)IOjYb zQP~DkvwcToa~8NSU4c!hIiqO{40F!p>)`G?;X++00>BnS$(cd`c_Os}Dksd7Pgv1b zEoiUCLf_>rhkXa$$_1NZ`7%wABnzxx$hZ4_2@xX(rRAv+V0i*4t)Xx?ZfGc$6Y+wb z{|Ns$){j4OFda&|Mj*BCru|0GJg{>6rAiSl@j4tzgfUTxdNoXhJ#RC zq5pshQ1YKq4iDhThp9|CY(I+Ih_M%2;+Oy4W+lz`IDQN-@c+o2$$m=*{M ztsJQ#W1EB13XoPDsBWolre0SJ2}jWD;7Di=(ON@E8bE6kkNNsH0R8+UZWuH9=RXtf zP+j6rb%{SFA3lNU%3RWf_!7?(?l?H=5=ruaCto9zA1c1CdVTc+qMS$6ouP{r15mOxG(kF?z z$&Zm1eTRIy7szHG8;{ZYb}W8~LQ9%hJPiVEhalS(!FFhZ9?}?aDETjFJYRnil5p0c zyKDJYdF%9cywY3%D#zaRp z;4()B>99wfPP!kVkFIR=`lR%kp=1qJE`fpv&~2kNkehfeVgLiQbUBWkCQ*&GHaU~6VB)m8Y3chn_&Ir$ZtN;u*Hh_+R? 
z8k8%J(F#<%y~$!$Luh4D2E%KL>uoeQAkOYV^Ld;fZ!iE;M+U!YEgt|01?o_iO%lRL}jzVfoEVM+YM$dzg zgBhaM{M?Q-yjn}hLKLM%c_EA3qaW)8PLl=O-;Cgif85#FpKigU*sI7q+~e%PIml!&$GNAUzZrc)UPJ-$Li5p zzxDMUborM2rsrkUF7LVxG|8*JzS=yv$u5$bj2d1zFFi-Io}3Adrv6M>V9Ekh7Wn_x z0xtZfMP2F+gS5b6>jS^Iv44Y8h5H@&h#y=KGcn*gCBKtN%R^Hl5<_1BpvV5x@u4zL z!-pQ@pMlQ|e8glGf1-xpa7Q);0-f%qZbRP0gR!{V-`U=}b*XWgb?L;`o){6g1OxF% zbm>yF-o(%A=tp~IC>#q$}B+u^9Gbv^xM(VLEP^-idhHXK0y3%$tZeZbaWrm zuHTQ2o<#ayj7)JTcO39ez#I(0MZlj6SP8fQkbY0U91oJhXH9HuK_ciBw#OK9t?I5U@_qRGy?*B60m~|iTU%bfQ5kj z09OMZ23!xA1pF*u9*lh$uo$oi6L}xt8o&;~>j7^CY{cy10YE?CFyQ9_PXIm)Scrxi z1}p_!iPiafzz)Fefa|a;vKR0^z(K$y;8DQyG5w;(?FDoLeipC_@CCp&z!_L)>;zm0 zcpqRl;32@f0AB<=1egQIcnfeo;L0~fM{5AL1GWKv8E_}y_W|z%90oiDxDTUBjdWU z(Qx$^y5`K7p0^tvjqrFXZgjv$KTMo8)5RA4eB74f)9@U8hdN|_;mZ7?)dgp4&D&vo z1!?)eYH)00_dNCF7#FTH`(-8Kz|N&@jEd^zs9D|fIj2VuQ&yr)NbV|tk` zKV;{B8}x4EFVN{LZTfkLouvN~o!)KJuLS+Zar9dcI_Y1c=MUQXw{!lhbb8#T-vjy{ zE!?EKGx{ygaE>PI&JA?p7m=x+e#d<_9t+1Lwz{6m&i z&V2am7cq9ElS6vY?`IRBXR>n}=y!vDnXaeb)^jK5Uk5#1I~OC*da?uPJ-E-Ar|DTc zfNs;5n-#ES4MWxo;JrXEYn`OKfja^Ei=dy->8Nkk3$~0|7`q<8SXig?WUUYkRNr#Y zp98%{r$YvLKGB;&H!wyn*6CqeejM}_vbz62{PS(eWi^_NiJl_J3 z_?@)syDfGdp96g#=$UNvHs}wJlm9$~{f9s=(sfvC%fAxzAA+9FhHm6p5BfJi-=gOy zKkbk*<^xZ5JPDo$ai7zKK&0wHeBZXq+6P$)%onfIWo0!A2C~Ir&<}xrg-+MvLK5_0 z(5XzT3~j8*!*KjG=(BZton3x0=#PW$)9KLhMzB!%KG0tSJyR_206iaj_bGJbk8u85 zLBAaIbhae<`#}E?=qddH`9H<^4}-oM^i29ELBH_X(NTO~)~8Gdp@+Kc)~T}`c01Ocy=)Yb?yaE1*R-8Wq~OROj%&c0#g>4vcQxD zGFd=uB;flPSc1}}-m|CHpiW48q;x4fy#tW084}GKO!NOpEn6Zq<7L=SZ|z zN7Jo(2bp^BAMF*;h3yV;z4`8Fgl_g?50x$~f5k=nE_7iVOk78pj-|P{W-=olCy7h! z0pU*UFEEY(XERQQB@#P0;wt7Fdfr197iXwJZKZSsJ6pSn|Gsas;I7&b86 z$grE?4u-oK?q&D@!$F3_439EA!BD--u&TQl3whm6nv2mv}Cfc$FhF%tr4jQVp%K(c3{Q z)8ey@b!qWA#`d)MT&o{x;?W1D8LI!J;-`!KO&v$~*>F|me>yE2-md&M6@R*+`bR4M z3`6y!R6Ly9tz+bUHXi!A)8g}uy=mGv+bA#|NQ*DD&TY_3$;KP3RoqI&V^_0E$H@C^ z;{%3zFTYOB#?#|HD|C$FpKZ9;q{E+W#Ydf*jcB9dVk*AKNY^j3jdQFxmdcMP+pS~d zeYW;KaGjcMohD&pUAWCLUK_PB@{Yn#7NU8>yi^K>N5b!L;HfzXuf`jdi=H*PpwIrO z#87#3(enjY8as3WPjb}zWQmuqZTPs1vG&{pd^-KVC-`R?c~!WT3~ur~e`aX-g+(4qfd1%9S6qY5{;RGhvL1-bCX{lY4Nw#qGM{8{g8q9 zXFBj7XZ$S2EBU>QFLL1Nx1=QJJO}@Ts_!h=@IPf-^BR6 z4*WjG-{-(T$@qPYSM#|yfTy_hjbcMw;&~PB=3@d$cs1Wv^I~+bv~h{%w=Rq$=dF=) z^uOui*h(=$9XlSKb0L*Gc0B#KkUw@DX%%wL9y9L57_a8>N}o>)Ib+A2dw{~1N=dBbD3DetUvBB8+$T$L}%zNU3D( zVg7koaFd)~ZkLN0zm4(VE&@opbCk{l zp2}@wIm&KP#y7KI4!g#qjK7BKrR@0+@CTC9%lvSAaox`NgO2|Gb>L~dbBFo0Z=3l!MLUAdcb1+a+y`EsdQvLc~ z;7L!XKYWY%o#Xq^!(48ViRS}P<<4OJ(f!4B3FD8jpM)5{lks0D`&>L%mAG%7sDA(Z4tA*Gt%ns zG~4hirkOr>KoaZXgo^hcxqZDueWk8Z;;QvR->+1At5 zwGCOc6n-uhil?v!I)gZg4hK}l>DW2?T~ZOS(hBKyYg)7BmBqFwQjV z3E-eDL0Tw&e&vUv5Z5k|i$d+;NEF9a5l5wIh9XoPl53XWc)f5BonBX0=0PQboBTbU zaT7n_+H7`5IzxeN@>sSyPbp}@sGx939@NN;M|4!Fzb(=&vy*h3ZB~Kv_=3GsOMf^X ztE&hFa0(nrO-o)Bj>LLeV;Sj{kRR&8*;^So76&4oojAQKBZH^ZyQ03jDG&{H$1^fj zi1Y3;k}HF~aU5wDY|F^8(7$zaMq+t;G}xVyS`i$V>?t#;ZMKCYT_JyGMk$_!U365Q zYSUBlg(GpZHF9cs#20*ze65kK@2TA2IDAWFHT+@WifNt3Q!aT(wQx`8ICWSg8PRFt zIDIzKliATc%-S6dZZZ8^a6DY*nv`#fL?QUprY>d1*f@^kQPRQ@beryRYQ31HZ3;zW zaWfL`9EW$IIPDSwPti9#%opF*Jx&iQlU%`Az&{=v7ut$L_M*W}neE4gwno8tO0I5y zG#EZ57s(o@0_Cy-IAih@jI3JJA8sE<9CO9G$1SwNDipmx=+7K{Jd3y$#2|xX2*+ty zRWzMs$W}pwZwtoZu^1@KCJZfGa9UxT7)yd?NQA6%1bZG^XedA&3Eha11_!PBaZ+L% ze)mg53Z+X!A;Vl%w|04T9kTSanzg1l=+HD*HdfbrP4Ai-I+oSMIm@Qk$4FmIqhZ#3 zd`)$I?F!IR2Di3UggDsMDq-!)l}%n8u3Ejk&dW!TdKTFu8XwGxV?8q;+nMp?O|REW zqZmsI;+V}vW&;k@?BYWntE$AIjW~aiPF)m-ks{JYgBT1koS6RFhKUr4LZB{+@p^G& zY_r*fCaE+VaVjhifn{b5nLDzL<`uPdt)*TsMrF|ik!=$z)9c+L3KEt0pEw_tO{dPg z#C!;&`ve=Qrc4}qOM$-3yhi-6TpS!bp(aN4c6k7--*TJ6fmRraQSfrC*RD 
zS8L{ef2~nYhb>c2a-N-BRizz9E2-){TW18EK)jQUP;REwoX)nTcsj9TtW!kE)}*t% zd&1Vp6~x)Xm?_EN=~<{~5{Wpnuq(0!Bj4ZUb6x~)6~b4TP0KlaD@UAwspxcZF)l3W z?um6E#$f=516PpbagHl@n8R#~Ma&M2+H?Rq@>QzgWMchyOGC4$ndYaH5f)Mg-Ujqq zae}p0$otzF5LP)f{2eEvrx;$kG>s>d;#^P=Mv=}9Q=~ghKI6QGBGCJ)n@8!Uwbx`g z#S&}pg5vmUd=SOS0c*O~Vy?4`9emb$dsl@1Ck*lQ4) zo=(W))vQTdCr;uHfj%Zxn5)Hnj7BA~SzwJ!=Cy5|4XJ~O=z!%k(X4IE+}s)c$Ln27 z=2;XCin%Jry&$%^0zr6@w_I$-t%V^)_ypJF?T9$)D+aK!wH3Y(^*n(NJe1&Fn1B~s zC}X@n0EHn1u|CD0p?P>~mlL(7DbgA9Hmq!HiZq$29RSZtf|2fEx{xxN^|#ln)WXhL z>12#AnA(YT@T5Aw-H&K3FS`P5=wYZ0bZj;^Ve`+9U*%W?HrKCETXo{W%j9jXU<|8Q zy9Pn8u;3&`{lyTR`Z~c{yF=>JfOLxo3OGpbh zVp~<%_@&KQ%mQepgb4v#(b=Gx6(6V7y1P|unUvw1o)+L4T1Pap)gH#-e2YWjIG#yj z-%hO9@Eo_cwXW6U6?MVXc#L;Ws4{J8R*VOiy^b}G*j%LmhgXVE^@t1;!}0yg zY>maVcAwzVQCgG+zRFM}CLSl0*W$U?7_WfYJ43AjGZ5<$Ppx1DOj{aw0G||V&>Y!X z28z`t3c=7(?p(W#8AINWCs43H#w?fu-|Z0bd_qwUkL(cPL_Pk_c*((%-5+1EF!As$ z(2?2!(rgH&GEhrTMq^d92HQhn8DlBVP2$kE1VdkYFm5(?tI1O&I+?+IA#5k)L2bB} z>U^f(^i1rbBBsY`=_S|yDGxFyZ4|JcTZn1uB(8Mqy%*K#v?*j)Xo5zu+>Fg5_Kv)3 zn#^<$yX205r=l&iiI;b#SlU4$ui4}i)6_}cW0fOQZ8OcY@Kr2Kn1WHP1?a17&Hh&L z%pm2V)MTu9Y)d0SAl_M+VSjtN@D3eJ(%Pv8p=*sz5Xf9njF zc3o6}rbiGBh|-PXYbIuU7rw|Nm5_t9r*iBz@nmii>pR3SG;NkOp=1g}8G=w#JRE#~ zydKW-}|4AkIG+}EvX8+Wzzbqeut(I^s@^+|2odEp!y!a;?Gex3~L?v)q6h_bUXN+ z`9B6aj%|o2{ndMR6y%==Snt_!>hDMXGw@OQkFfpOVu!zfP{~Lg%cjT!2>b)ci`Z(V-hKf(YPXSNw+)-)u{t^ZEaz4_Z^i=vY+8DqM z*;M6M?>SL$2P>?~SNT=_zrgvIvq1Ge6b03Lh$#O#_&D=_1vn}~$v50c(B&3?@yn;w z%h~?-fUM_tyCuJZN`ED#kZ%PX_u3f7A6zXTVAr_Ti zy-!9#I%`M2oaH|YqMl#9cSgZmPuFwX{-dxjIr6Lb(c4cUe98~L z4IJ4|>$OUTPUT_BW_PATH - echo $(date +%F%n%T)>>$OUTPUT_TIME_PATH - for n in 32 - do - for w in 2 - do - for m in 1 - do - for iter in 1 2 3 - do - export INPUT_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" - ./clear_static_ofccl.out $INPUT_PATH $OUTPUT_BW_PATH $cards - ./clear_static_ofccl_time.out $INPUT_PATH $OUTPUT_TIME_PATH $cards - done - done - done - done -done \ No newline at end of file diff --git a/test_scripts/ofccl/clear_static_ofccl_time.cpp b/test_scripts/ofccl/clear_static_ofccl_time.cpp index 4c49834..bcefbb8 100644 --- a/test_scripts/ofccl/clear_static_ofccl_time.cpp +++ b/test_scripts/ofccl/clear_static_ofccl_time.cpp @@ -5,7 +5,6 @@ int main(int argc,char* argv[]){ freopen(argv[1],"r",stdin); freopen(argv[2],"a",stdout); - cout << argv[1]<<" ofccl : "<qS3QgHXf0zt+ZHucbcP)cTl+));LsZK?Tv*WPQN zbLM0w)O74WzigPSv)5XC?X}n5d+o=0>^t1Gt1}!9O@&PDA2flKE}$wp@u<(3*~HGrUpKhdUCXhs!-ClmdEu_>>E5pt zVl+Fvhs&p+%I_0MM|SyZ1DjK8;QUh6Lot_IL24%yuILD~l$T!7(Rz7DAlTh=c~3?8 z<>jSCkx)^Ipp#yd2UXvib9n zvrx&PJc)+_rDqcu@$+#-JhJ@%mT{+J9(tapU4Y*r{0h&#=bT~xGo6w9W@ofDd-Lb- zp0f(*q~UT1pPPj8XJn8JKQ9IUw^QKHNFj$xmrQoJ>w6 zg+3RhDA#BT{K6FZzL`SL`%>t0OA7pNQpo>B3jME1!GCcI{Ie&0Q{Qpgg&=-a3jEJg*z@y{e}=X|tL&1(lwbQQ@E0VJ{~wS;{CjVe!~(=A+{SY1 zZE}WSAL8$2d_MDQ@DGt_tM9VQ$S0I*y580q3hI%lHyqVPn)Ie(-QBoMZ&S{@-KE17{NAL27BcY(T zBM{xDZz;A}D6#IRvQ5KUX>eqqDuX=qfA z*{(lS?Q{CZXsIvM9p%bH12(FzF&h5sdfRks3ofM1=hGwPL0p>N9NZc}_6`0>cc-8H zi%+>0cop9*y085feX}>vp~3G^dle@1M*Tg3D50njo5TK4mtWB~2dEKz8XiX{Y3!Uj zy-2)f)hc~yQL(nBwr1t34I6M>T2xZ5;j(IFjqWO1YNc4Oi%MKFs+Sa%8doJnW$>Yy znDu0fzghU3iTO$<<}!qp|1z}M0-r7TXy%l|bDb=}3}EReg99_3YDCmHQGe04(}@2` zW^jO$Trtt>@!N9zevDYNv>{IOU*Ee08DwkEb3A&7aCy=}f*M~5Zylf2_)2&-jYy)#X~HiQ82Klk^_h(*<*h=21&;<{1g8a$A>9at z7CbivK`XZ4t?P~o3!d^;q0)j^bxAaj1&;<~gnA49R0FGN8!UJ4K{ z(O`|xWx?|jRnU4Y_&kGK(|Rp<>QgG*Wx=1JkbpZacAvqqdYUdBfc8gbfi86W*OBTgGF;gE;Qn_aWX!7 zh7qR?lkri95ziCxcRx_&r4164--y%32+D87X+s3%H{!Grg7O=2+5kcMjW}(5p!`Oh zHat*%6^HfD71poTg#8h3uA|y8<8`*U+ZMmt7B92KFSo@nw8hW0#pm1Nr`zJ$w)kIa 
z6ZQSQE&jGG{+ccRvMv7Kw)jtN@gLjbKeEMtXp4Wx7XN2!-1T-%?Ae< z)edBrW9!K?aQc&|D9^yv2T@%U^Y2BH_jghh7oLIa#|bPy9-WV_6{Kr)ty3mO^3LxC zyLN~n_`e|d%kCy{$p@a;nCIZ{uJIf^p6PKs?RoZt=sXB;vw)n5k4AAQ7&$_Ts2zJL^gP#8R^GKoo9Ypl>4Z?k($YnZql0C7Z z_!_ia@QgqelP6#DT+KX-1W$s1*we7aF(isVPhtnkKE}L5LX)RphjU3J_+r0^|D2G9 z1_h9w_o$Zl#9&{%IJV;@Az+7-GJ!&`%jhPKUM!;{1VH^c{F7&3hsM-HGFr*eZ%Jku zB@F{))aZ$ciJ=`wK!a7Zz8}$5LEh)dX%6l938VXk9({wSE*Gdf&POh$oP{K33FPz% zs^i%3(2ik4AYEmN6i%TC<@^rW=2Q^+1f%2FQP02^1$khHEX?y%o^trtp&ic|qUb8q zmidDu>U)1CQACEQK2aLSXfYZCZ2tguQIX}wYr(e$N_t{s|F6^-_HPgtILf*7@1QG> z4;ry5L-8W$;aPqt@7}@fgHcqKefmL%r$Suh`) zZ3ACqJfwOwhWGr~?I&X0$NO$S;pi^8^Y-J^YP&B% z^*&0~8yC9F|Kl1eYuPt|g6&?U7z8f>ap!*+!k(a{Ptt%N--Cu?$t8CCaZjv!%+v24 zn~dE#8AXhqes~4PB#n|&~a((C}Pr_$`1sp$RnhAA$2$`c#L0FW&@0+|Q?l?wk1 zm4CM-$D?S7Q|0(g;*Fu>k`RxSg2G8bU$zJe*o>lVZWyMX6IbS=rlPJyiOpjya*Gn# zUzy_>sCP=W>vN7>Ag_kv_X>>;dis&ZEYn=H*o1PZ`tDcxVrW!Wv9=Af(V>8-2uMBn zO18%_Y-?}G%7Rg)6ABE)?}7_Tn)?u%5?Aj~B~3bIRS#8)J7l&tji|LvI*o=EYmR&R zZXeU4AD31`w7*VSY(a{=6SPf~WtdL*&pA?nrG47C-EnF)dBV-2B_LQBjm?o^+7e@rx2t z?MEBYEY@saA{Y~ImGq(b2Q+rl_&^#A#m~ip$kSg8D^XTQ zJh5LA_Pa9o{hmJeey2)#I5FjKiAQtwua&~fY!u*?CHM|=wChQgfZiP_#eU@u8sY!HugsRv+rWQaxQ5? ze6e2?W7Un8TupV&nL#Yy>Q_l857>@Cr$Oknf3hb4X(;}j z=#Yn$#{c|B*!Pk{p4bb(3}OJ2Q&Oyu(1*eP#u(^ax^2XZuE^1wMHKB~0A64KP&$Yw zP#WrI4j@qd%zQO5FL#fG=PJxvQUsVSxeBu!7%|5|D(fsUJH8NBLo4uDTY;?h$^IC% zZ~Ec`X8Voby8mgW{T0}$P$7Y_JXw(xRh*2uZo$T`*wo7D|lYfcO?Q)rKLPUp8ZQn0*Ccg4r!P%YpxK)G%xy+|}K7?nJ#~aA4%bgsbTuQ>uL_S*nw?P@4H7 zJ!B-#7K&h#1bvfeFv-z0O-`Y2?g z*T-d>!P1Kg7emGa=mi7RuS8FnLr$HB;>03fg`BLbmt?9XT7gwOK^3>$J(hPrEvK{N z8EzgcQRD>n3 zNtPsEP=V}GVjjU_b%V(Eh*5i2xrd`GfDtx%hG#bH8lMzVn!*iAx8RBWEIurZ;6Ca^ zr;Iy~JuV8-ceujhDJJKtavXr4F^7;5h(bj!m#Cq*Q#2zQcwm;Hc%{hxS9H06C?erd z{CTMW@_v{&k6lHkCZ^0csX0(*q4J3q_$~&i_@9B7Jcr_W$G{`^3+B;fqT;X?wU^_t z))>ZCk_3n`f`;oDP1sPwMx8xFiZI76H5)P-|Bn-Q4U8%k>q&GIQME$JLy&_RqNVz| z9;xpbOUQgmF}be6VJd!dGy23LKucHrtmC7M=sT}|xys^*&5@2i@sVkeqC1D78Fy0P|> zzd((g_|IO0{DXhVjGpg$k<-`4#xd!uj=fPeG4I!&zC#Yr@|U~cM7eTqxRIuJRX0`X zhc<66$~%8MWhe5ZDi=+X?mHQGngn^~&x{3TEHGn%84LWKEa1R9Eh?xhOg97;n>cvm z#(W2-64$->i5D)2nHUgG%5P_~q@l?jiJ?aT=(hiC{3y>;@T1%Kr{OmTKQZmbpRnfD zoT1G=Ux)KbrzS6={z%jr4Rrdi)GjrGw#Dy@hQe1~sn_XvmyX`h(*wbXKOEH@3o@^| z3;NeX?vh_mOdP=VLn9LtuL2$bJOQ}mm5GT50q6gAV&Z)$`Nr!L6J1cG^bJfH0P6tf z0q+AW1>6GI1o%zBDB#n8y8vGYd;~BX&Qb^$&PxD)VafDZ$H9QV|Q0h<8F z0KWj3k9P7PU@_nrU_Ia*Y!N*IcpKmeK=m$+rwP;P5MtjKO1zR zucB`@>2HC4Dd^&TFh#%4q|bpqQ_+{4gbwkV=a~nVAPOg(6hWUC8G*==Z70@d)T+phLbXqs1)8bD*C9Jy|)ONJ4fb`X>Oh z9)NX~9vL1Xgm`G7Nw)A1&z%;YJIs7mz(1`6&mG_~Wo$9Zu@Us|f_|};ev_I0Hqajh zo$O+yzrm#M0sV2%>2W55USrY^fc_BZ$;wG~e2M8;LYV0TX8IGLKY{dlR{Cm_eh%u1 z^j~hJcbW7RpdXn+zm1@i{zcaGeltDom{aEls{vQV4(6H0R2oX ziOQ|?PBZ;Wpf3SES^3E3kD&bLfc`#U);Hj6RXcE_J-(h{jUuAT6r>72?i=}Dd?BdIB2Cq2DN;mH-Wwm z^kr6h(3Bqq-4FVwtn`d}L;fz%zY2OXdpMEi5zu#nzQLNF%3$$@N5OL#JgIDc9Q0>E zPt`XIVCMe@eT6lTh*^dT(8oc)$V$J_q;CM7<_f9Weh=ugFgHlmc6Wn*4(O@sU_a;{ z(95j(cbNGf1^p(_7g_1ooAl$LdqGcTW2(af%pDp*Pvu)GKyLs&V3nV-Nn}9jX^ygI z3i&j5`5fr0t?6%<`ZGFy`TeV})M zp3onV{xh8Z2(Po;kx^w0|v6L>DxEPqC|D4-Ll9LzcQfG*zMS9Ks=5~a|{*DV20 zJ@W7qVs&o*=QC<@3qF_W&UFrCxpNEe$*#^V{-;@<+={;0YjP`txfNBp#Z|e5D|4MI za|>4H=C91n5#LLQLQW6lzy%orkJg>J_Ga9>b7X&gw)WkOXEI-&r9CklETD)VuD*Ps zW5HF|YuZm7c z@xak{92Pz!^O=ho3(Qzx#sV`In6bc&1!gQTV}Th9%vfN?0y7qvvA~D5fY@NbH<7V4 zrJ%mEr`D!+NPDLg6rR2TNMViyn&y-jr*pjLN_ipH>9|zi0bIazb*}|)sEMGy`$u~S z6tE2@@W)wf(t?_zEt#(p%hf0t>b?L{RyQJuF}pA0BxZ<%3-X3Xdp@ z_--~VZsm%&(i682Mf_T(zI}>UN36PKl{xSkG`1!!Cxs40kfz%kW`_ zgA9imjxZc!s6J+xzfcpw$*`DVCBu4#n;3R6>}9xoXu1h3drAQ4k(Yp#%MO)bD>!1}$@tKW0|M)}1=e1Vp%UuJ4&8GbC0A6~Z0ijmiumhXXEshP%U 
z5+>Gx%Pj5P2@@l)$PHy+x3qpyB8A)|;d^a(st&@d@kXVhdr%JOvp*~`lph7U@8C#c zhfd&0j{2S~@lx1^pF^8$&)vW$)Bh2{KUd4C#07%N_h;gIu69Z#VNLw2;(D%jY9(P! z{JY|Mf%ce9e-h}xm-XjV5;y4}!^Moxuat3>?{dbUVZ+l44kTx;4Sy5k=P_Q%zn$>~ zHvCr@e~t}LuPT##rwu>C_{BDSHpWNdFSOwoGk%E;znbyIHoTAVr8fMXjIXfa_cDHk z4L`*AN*n$i##h_$ABTS={XI53oeMztS{uHN@%1+R=NaE*!+)Rg8*KO&8NbnnpM{E} zd^g$f^x8k+TW$F38Q*Th-@*6}8=l@#BspC+{Bw+t+VCeB-($mHAnr@d)o!!lYZ%{a z!{5sI?Kb@VjK9l<{}JQwvEhHq_?fJ2_)gwd|S6YsxJgdaB@!#{i{tXkjDNf^ z5kJQGIpQHDJafUqHk>Nuyme_J|8C&(wX?MX+j#ml<}c)YS)7)E1qkVLHd@C^pA$wMoO9AJx zoO^*!#{W_Z{-QIaoZZW%fLfNb7x;xJm->D@n!N}IfKMiW4v?hJp$f_8Vg44zKh5n^ z;h#gn=VnZv_dd-0H*>q(llRe*KC zll&LBzbKxE(cq~+Iu}W9RgXVl{?oXI5GC!KV2)h{nM_YgY7VtFQ*~h5}{CqyvYlC4H zz~J9SdrQPo%;A*z+WpUZ#=pS%BDn}(0X~_1{vCMIe+L^<@tlW&lJFT^FXd8<_6^`k zPkVd#9`oDB_jiC-GFgNG^DqcfKhmz0SU(f#DRRoUiuFhL7hwtGbIW8r!1x&BkCaHn z2F7Qgf0CT1SaGGpCm8=Mw=3@6+8*E)4~r0Be+v9BSr>76d*Sd&pMT(XuG;M= z@J{Bjf^3Xm#Q!ap54R>lD~)SbM!vp{`R(oPKLvlj*2VRk&HOJgzuiAyf%>L=ALsT4 z^NO&G@gBB=@~dw%-tOP3;rB_-=eS~#oe1|a-pPJa)%yp)Cp(vI9vDc@qs)kE5}^h0 zWOlfd`7_xMi02iN;tk-PGHJr!KQRAwY-esRS{v$#^8F;Ye+=s)bOG<=Yb&^$`ETNU zxp`^7W&B-$6qL?u!9eou?cwXdD;}AG|N6qwNVFSo()u*>Tsb}3sr%?;xd_g9Y7Oaa z9ibL)hu(@8pCY=qyGQedI=edjQGaVuMR7UKm`cW^&UY1U8Viozix^S#lW3<+_yer`M`L_ll ze%%)e;$X9GA5Pm6q^07uEHAW$%r=QE53~hCVVqe-92L4A2vMGhu3L^X_JZAXgk5ck z3nlSy_I7tfb-dVhi{2IL2>7q@-6sexMTvaHZr}<_mRn;0Ujj46b7Ls=BI1 zUpUYeP03Uy4!uiBuJHFnaju!aH6_PV@77yV5=+~{{;rhNGXIohSBXw_vn?3v40tepGp_9uaKL20I*Am+LSLN%Uf^WGjhBqjhV$w$ADwRB>TClrg ziZYZ-Ms%7ej-U;7r*3F2X6*|5x9Hw2I3q4~NlG_|!Vr9NQx`L1WQvC4QqqDUbepaz zO1+GwZ4QJZQ9Tswn1XkyIPww#PtrGB%op9(HAN39kzD?W&pQV z6R3-9+-{s3+oU(5N-FdQ91Y7uV2NH$<_>M6$wf_VOR?LHQCU<$XxsGibi22Rj6@;+ z9*3l|>C~Z@m=s}jpJpXhmxz;Z$leU`P;d1{ zy+%IMEc_}_ct43Unn${9V49QERhpVJK-bcsf2%xqm!^-?vbVd!=w6-mYw46;%M_Ci z_{BW56SL_64saHkOs`Kx%jK5{YPffmqj$McA{?)~4E?HRJ)9fPbJe>s(0;^rG>*JY zc6@O{zaT}{TFm{?TBDRsT&A96KR~&%(sClLq^d)0?H+I%{!Ugxsh&`BI^dT4>GaOB zP7oz)lMeLm4jLnu9|sI$rX-!GYpJ41B>c$I&d?T&e1B7$bHi~f7rshwT*>ZRHNs=|XX) z2y(YFmQZLt>1h3~GAv^;mopY)AH@nm%)G^939CJ0AUQTdx?4t+dpW3-zQH%x+p+W2sB4 zUD-fk1#=BT)6;3Scr|OXtP|6?Ltq_~%Jgf+e2hjVu~}e@O#1b$9rcNWi0FW&G|{YS zNL}42+mGA5p3GAo_KUeH#yvl_xqN=KB6q3Sj9U*w3ik=F^zDe)$}0x2$)y!-AL_Y$ z?YJ|+yD&aCwooRudLI;q6!`jt_6*H~TRZKjb&a8ph`WAuLu05>PwW7=Vd4*U`IEVn zbe7*-uTl-$XQk;FUof>3>)^CDezPCZT3&Vq+R(#L>ubM7-;B*a(|?s>5!h6>N^RAN z8!zeGTEQ4rr*;keZehVRdHrRRJ~LQjc1Y_qAlc%9oX;|NRhmnxev`&R8=}NsimYPH z&M*LXdT;UTa>2P7+a3NMpC7lxjK$}4d;|3|+Y(lERE#fjxz-q^djVU$;ntd|2Aiq8 z52g{x9T40Odm#2cM$!Lf7mlTR9dtZzoo4xQ|=YQrqHki?U#9JgId~t1zu@6%j5N{iCKR}lzB#O;LA8r=7q)GA>1jzEh~_eHwJT`O1t)0TQ3 zz|&$4nj;&_K(X3HCKx(O?Q6G5W5`Ew2MX55m<3bd*$&~)r{(2v%MKn+l;dyom+UOL z`Qs4_9XH>6?TH;A$%arO1J!goDyyu;-xdf;A4`5NjYHpZ41I0>sNU42CQqSoI-U7a z*iOiU+Gtv;^XXpQHNBgPm>#R8mt6lR+{jGZC}7;T5YyB&u5`@37uD%3Q^?N1G>u|3 zGd7RdJ94jU)RW!pk~;#fvev+6Uf$_qX$OVedZR~7Q`5Z1Do3W8CYopAQ7lZD{9&vG z==ruLZ;QBRkZ@Bf9V;Hw((s$j!L0%G;s$@DyVH*~$*NWQ(xT$@jV^L|WqNIT@J>ok zV{cXCdzvd}nYzo5#v^yWg$woHI=#l|Zb3tuG#R7yxV&4HX6@CWiagWdYfFuNQ8@2( zF7Zt((?b|)Cr4O%sn6K6G^YG%8ae_!v=)}_Qp{V$V>9NW+<3+WwPHBuP|MAD-~k)9 zP~yYRaB0^?IcT~CQ4cTOARaT(+dA6YTBYNV`_lx-Ez-B#ee7Ws5%?!Si7W z5gjzNWiKV*tFDfyRzx@IwIW;=wT1X1;`eDq*k?ynco zZ%_Y8&~cPQNa?S>x1%6Gu5WyA$F9E@=}*H?r9aC0E4Yc{R7SF$O0VA62mnVcD!ux? 
zkAmuZK41~SZvR_>rEle^^z{BK1qD4aYUr={6#NY6^z9uLSKl#Fa4)AP{Yg)yKclq) zT#!vwdiA{%1$$ZHS)4$nSLMH-)30Q@`Ywus>U%_#{w)0L=^p@&vQY9hCn6M_;xFES zN(}b;-vhEWz0)cA6;%2wDR>h~1O@*EF>8ADJr)HOA1A{b9Wp5Rur0m%Zi|A}GdL)v zN~hpsw)EFg1?q==f{~x5s ztJb1#P!8&QU&9+IX4aRCneA8UkqPDLT`zVM#`gzCIKARal|B#YsqL%u>ialxl|S2$ z`Y#0~Pql+@14s5#>DBkZj^9Mw;3uLbsPqaJfNo81yf3*)CQ|)eiBRcO{5&KkoXWql zi_>kAu-8`ql@e|IUBvkp8yI;}ut`FGt;(R1t`w}$mW7~uHVL57d(#x?{XHE&rN1iI oDZnY+acP8o-STosz39ahm9Jtv0~g8ChrTX3=u8`Hutl~12iOU-{r~^~ diff --git a/test_scripts/ofccl/run.sh b/test_scripts/ofccl/run.sh deleted file mode 100755 index f7158da..0000000 --- a/test_scripts/ofccl/run.sh +++ /dev/null @@ -1,47 +0,0 @@ -export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib -export NCCL_PROTO=Simple -export NCCL_ALGO=Ring - -export DATE=221221 -export OF_ORDER=1 - -export TRAVERSE_TIMES=10 -export TOLERANT_UNPROGRESSED_CNT=10000 -export BASE_CTX_SWITCH_THRESHOLD=80 -export BOUNS_SWITCH_4_PROCESSED_COLL=0 -export DEV_TRY_ROUND=10 - -# export SHOW_ALL_PREPARED_COLL=1 - -for MY_NUM_DEV in 2 4 8 -do - unset CUDA_VISIBLE_DEVICES - if [ $MY_NUM_DEV = 4 ]; then - export CUDA_VISIBLE_DEVICES=0,1,4,5 - fi - export RES_DIR=test_result_${DATE}_${OF_ORDER}_${MY_NUM_DEV}cards - if [ ! -d "$RES_DIR" ]; then - mkdir $RES_DIR - fi - - for n in 32 - do - for w in 2 - do - for m in 1 - do - for iter in 1 2 3 - do - export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" - ## Time - echo $(date +%F%n%T)>> $RES_PATH - for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M #32M 64M 128M 256M 512M 1G - do - ## Test - /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH - done - done - done - done - done -done diff --git a/test_scripts/ofccl/static.sh b/test_scripts/ofccl/static.sh deleted file mode 100755 index 3a65584..0000000 --- a/test_scripts/ofccl/static.sh +++ /dev/null @@ -1,21 +0,0 @@ -g++ statics_ofccl.cpp -o statics_ofccl.out - -g++ statics_totalCtx.cpp -o statics_totalCtx.out -export RES_DIR=test_result_221120_2cards -export OUTPUT_PATH="./$RES_DIR/result_statics_all.txt" -echo $(date +%F%n%T)>>$OUTPUT_PATH -for n in 4 -do - for w in 2 - do - for M in 4 - do - for iter in 1 2 3 - do - export INPUT_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_M"$M".txt" - ./statics_ofccl.out $INPUT_PATH $OUTPUT_PATH - ./statics_totalCtx.out $INPUT_PATH $OUTPUT_PATH - done - done - done -done diff --git a/test_scripts/ofccl/static_time.cpp b/test_scripts/ofccl/static_time.cpp deleted file mode 100644 index c079845..0000000 --- a/test_scripts/ofccl/static_time.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include"bits/stdc++.h" -#include -using namespace std; -int main(int argc,char* argv[]){ - //cout << "bandwidth"<<" "<< argv[1]<<" "<< argv[2]< a; - vector b; - string ss="bandwidth"; - string str = "N/A"; - while(getline(cin, inputLine)){ - if (inputLine.find(str,0) == -1) - continue; - - stringstream line; - line << inputLine; - double tmp; - line >> tmp; - - a.push_back(tmp); - } - cout << argv[1]<<" time: "<>$OUTPUT_PATH -for n in 4 -do - for w in 2 - do - for M in 4 - do - for iter in 1 2 3 - do - export INPUT_PATH="./$RES_DIR/test_result_"$iter"_n"$n"_w"$w"_M"$M".txt" - ./static_time.out $INPUT_PATH $OUTPUT_PATH - - done - done - done -done diff --git a/test_scripts/ofccl/statics_ofccl.cpp b/test_scripts/ofccl/statics_ofccl.cpp deleted file mode 100644 index 462fffe..0000000 --- a/test_scripts/ofccl/statics_ofccl.cpp +++ /dev/null @@ -1,36 +0,0 @@ 
-#include"bits/stdc++.h" -#include -using namespace std; -int main(int argc,char* argv[]){ - //cout << "bandwidth"<<" "<< argv[1]<<" "<< argv[2]< a; - vector b; - string ss="bandwidth"; - string str = "N/A"; - while(getline(cin, inputLine)){ - if (inputLine.find(str,0) == -1) - continue; - - stringstream line; - line << inputLine; - double tmp; - line >> tmp; - line >> tmp; - a.push_back(tmp); - line >> tmp; - b.push_back(tmp); - } - cout << argv[1]<<" algbw: "< Date: Fri, 23 Dec 2022 06:50:07 +0000 Subject: [PATCH 075/109] =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=91=BD=E5=90=8Dbug?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 06799ae..9eb5383 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -11,13 +11,13 @@ # 设置超参数 # run DATE="221222" -runNcclTest = False # 运行nccl测试 +runNcclTest = True # 运行nccl测试 collectNcclResult = True # 统计nccl测试结果,写入xls -runOfcclTest = False# 运行ofccl测试 +runOfcclTest = True# 运行ofccl测试 collectOfcclResult = True # 统计ofccl测试结果,写入xls NCCL_ORDER="1" -resultXlsName="result_"+DATA+"_"+NCCL_ORDER+".xls" +resultXlsName="result_"+DATE+"_"+NCCL_ORDER+".xls" n = 2 m = 3 #nccl w = 2 From 9daa76c9f20fda4b3c7a3e9752f6d54aa4271063 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 08:36:57 +0000 Subject: [PATCH 076/109] =?UTF-8?q?deltaSec=E7=9C=8B=E8=B5=B7=E6=9D=A5?= =?UTF-8?q?=E7=BB=9F=E8=AE=A1=E5=BE=97=E5=81=8F=E5=A4=A7=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nccl_test.sh | 2 +- ofccl_test.sh | 6 +++--- src_simple/common_simple.cu | 27 ++++++++++++++++++++++----- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index b5ca1d9..76cd861 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -19,7 +19,7 @@ if [ "$BINARY" == "DEBUG" ];then export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=16 + export NITER=8 export NBYTES=8K export WARMITER=2 export MITER=1 diff --git a/ofccl_test.sh b/ofccl_test.sh index 073c8d0..2070999 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -45,7 +45,7 @@ if [ "$BINARY" == "DEBUG" ];then export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=8 + export NITER=5 export NBYTES=8K export WARMITER=2 export MITER=1 @@ -55,10 +55,10 @@ elif [ "$BINARY" == "PERF" ];then export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=1 + export NITER=8 export NBYTES=8K export WARMITER=2 - export MITER=16 + export MITER=1 export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 6701244..bac1c31 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -800,12 +800,22 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t size_t count = args->nbytes / wordSize(type); + // Sync,参考nccl,把这个也加上吧。 + for (int miter = 0; miter < multi_iters; miter++) { + seenCqe[miter] = 0; + TESTCHECK(startColl(args, type, op, root, in_place, + 0 * multi_iters + miter, miter, rankCtx)); + } + TESTCHECK(completeColl(args)); + Barrier(args); // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter 
= 0; iter < iters; iter++) {
+    auto iter_start = std::chrono::high_resolution_clock::now();
+
     for (int miter = 0; miter < multi_iters; miter++) {
       seenCqe[miter] = 0;
       TESTCHECK(startColl(args, type, op, root, in_place,
@@ -814,9 +824,15 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t

     TESTCHECK(completeColl(args));

-    // int cudaDev;
-    // cudaGetDevice(&cudaDev);
+    auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start;
+    double iter_deltaSec =
+        std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count();
+
+    int cudaDev;
+    cudaGetDevice(&cudaDev);
     // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
+    if (cudaDev == 0)
+      OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6);
   }

   auto delta = std::chrono::high_resolution_clock::now() - start;
@@ -825,9 +841,10 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
   deltaSec = deltaSec / (iters * multi_iters);
   if (cudaGraphLaunches >= 1)
     deltaSec = deltaSec / cudaGraphLaunches;
-  // int cudaDev;
-  // cudaGetDevice(&cudaDev);
-  // OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters);
+  int cudaDev;
+  cudaGetDevice(&cudaDev);
+  if (cudaDev == 0)
+    OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters);

   Allreduce(args, &deltaSec, average);

From 1f58d485744e2e6be6cfee389c05ab41094aed81 Mon Sep 17 00:00:00 2001
From: Panlichen
Date: Fri, 23 Dec 2022 08:59:46 +0000
Subject: [PATCH 077/109] meaningless NEW_TIMER

---
 src_simple/common_simple.cu | 45 ++++++++++++++++++++++++-------------
 src_simple/common_simple.h  |  2 ++
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu
index bac1c31..884fdba 100644
--- a/src_simple/common_simple.cu
+++ b/src_simple/common_simple.cu
@@ -811,10 +811,17 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
   Barrier(args);

   // Performance Benchmark
-  auto start = std::chrono::high_resolution_clock::now();
+  #ifdef NEW_TIMER
+    double deltaSec = 0.0;
+  #else
+    auto start = std::chrono::high_resolution_clock::now();
+  #endif
+
   for (int iter = 0; iter < iters; iter++) {
-    auto iter_start = std::chrono::high_resolution_clock::now();
+    #ifdef NEW_TIMER
+      auto iter_start = std::chrono::high_resolution_clock::now();
+    #endif

     for (int miter = 0; miter < multi_iters; miter++) {
       seenCqe[miter] = 0;
       TESTCHECK(startColl(args, type, op, root, in_place,
@@ -823,21 +830,29 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t
     }

     TESTCHECK(completeColl(args));
-
-    auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start;
-    double iter_deltaSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count();
-
-    int cudaDev;
-    cudaGetDevice(&cudaDev);
-    // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
-    if (cudaDev == 0)
-      OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6);
+
+    #ifdef NEW_TIMER
+      auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start;
+      double iter_deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(iter_delta).count();
+
+      int cudaDev;
+      cudaGetDevice(&cudaDev);
+      // OFTEST_LOG(TEST, "<%lu> Rank<%d>, done %dth BenchTime iter for %d multi_iters", pthread_self(), cudaDev, iter, multi_iters);
+      if (cudaDev == 0)
+        
OFTEST_LOG(TEST, "Rank<%d>, iter=%d, time = %lfus", cudaDev, iter, iter_deltaSec * 1.0E6); + #endif + + #ifdef NEW_TIMER + deltaSec += iter_deltaSec; + #endif } - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = - std::chrono::duration_cast>(delta).count(); + #ifndef NEW_TIMER + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = + std::chrono::duration_cast>(delta).count(); + #endif + deltaSec = deltaSec / (iters * multi_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec / cudaGraphLaunches; diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 406f634..c80dfa9 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -18,6 +18,8 @@ // #define DEBUG_PRINT 1 +// #define NEW_TIMER 1 + #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) #define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) From fc09438e79c926434fd9cd77b1b88afd9859a923 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 12:01:05 +0000 Subject: [PATCH 078/109] check frequency --- nccl_test.sh | 12 ++++++--- ofccl_test.sh | 16 ++++++++---- src_simple/common_simple.cu | 50 ++++++++++++++++++++++++++++++++----- src_simple/common_simple.h | 3 +++ 4 files changed, 67 insertions(+), 14 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index 76cd861..89ba9a8 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -17,7 +17,9 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/all_reduce_perf" export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=8 export NBYTES=8K @@ -27,7 +29,9 @@ if [ "$BINARY" == "DEBUG" ];then elif [ "$BINARY" == "PERF" ];then target="./build/all_reduce_perf" export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=4 export NBYTES=8K @@ -37,7 +41,9 @@ elif [ "$BINARY" == "PERF" ];then elif [ "$BINARY" == "MS" ];then export MY_NUM_DEV=8 # target="./build/ofccl_all_reduce_ms_perf" - # # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi # export NITER=200 # export SHOW_ALL_PREPARED_COLL=1 # export WARMITER=0 diff --git a/ofccl_test.sh b/ofccl_test.sh index 2070999..2745011 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,18 +42,22 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + export MY_NUM_DEV=4 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=8K + export NBYTES=256 export WARMITER=2 export MITER=1 export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=8 export NBYTES=8K @@ -63,7 +67,9 @@ elif [ "$BINARY" == "PERF" ];then elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" export MY_NUM_DEV=8 - # export CUDA_VISIBLE_DEVICES=0,1,4,5 + if [ $MY_NUM_DEV = 4 
]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi export NITER=200 export SHOW_ALL_PREPARED_COLL=1 export WARMITER=0 diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 884fdba..b7ef3ab 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -103,6 +103,28 @@ static int average = 1; static thread_local CallBackArgs cbArgList[MAX_COLL_NUM]; static thread_local int seenCqe[MAX_COLL_NUM]; +// bool StringToInteger(const std::string& str, int64_t* value) { +// char* end; +// int64_t v = std::strtoll(str.data(), &end, 10); +// if (end == str.data()) { +// return false; +// } else { +// *value = v; +// return true; +// } +// } + +// static int64_t ParseIntegerFromEnv(const std::string& env_var, int64_t default_value) { +// const char* env_p = std::getenv(env_var.c_str()); +// if (env_p == nullptr) { return default_value; } +// int64_t value; +// if (StringToInteger(env_p, &value)) { +// return value; +// } else { +// return default_value; +// } +// } + static double parsesize(const char *value) { long long int units; double size; @@ -810,6 +832,9 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t Barrier(args); + // int64_t NEW_TIMER = ParseIntegerFromEnv("NEW_TIMER", 0); + // int64_t SHOW_ITER_TIME = ParseIntegerFromEnv("SHOW_ITER_TIME", 0); + // Performance Benchmark #ifdef NEW_TIMER double deltaSec = 0.0; @@ -819,7 +844,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t for (int iter = 0; iter < iters; iter++) { - #ifdef NEW_TIMER + #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME) auto iter_start = std::chrono::high_resolution_clock::now(); #endif @@ -831,7 +856,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t TESTCHECK(completeColl(args)); - #ifdef NEW_TIMER + #if defined(NEW_TIMER) || defined(SHOW_ITER_TIME) auto iter_delta = std::chrono::high_resolution_clock::now() - iter_start; double iter_deltaSec = std::chrono::duration_cast>(iter_delta).count(); @@ -856,10 +881,23 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t deltaSec = deltaSec / (iters * multi_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec / cudaGraphLaunches; - int cudaDev; - cudaGetDevice(&cudaDev); - if (cudaDev == 0) - OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + + #ifdef SHOW_AVG_TIME + int cudaDev; + cudaGetDevice(&cudaDev); + if (cudaDev == 0) + OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + + // int clockRate; + // cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, cudaDev); + // int memoryClockRate; + // cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, cudaDev); + // OFTEST_LOG(TEST, "Rank<%d>, clockRate = %d, memoryClockRate = %d", cudaDev, clockRate, memoryClockRate); + + // cudaDeviceProp prop; + // cudaGetDeviceProperties(&prop, cudaDev); + // OFTEST_LOG(TEST, "Rank<%d>, prop.clockRate = %d, prop.memoryClockRate = %d", cudaDev, prop.clockRate, prop.memoryClockRate); + #endif Allreduce(args, &deltaSec, average); diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index c80dfa9..1e61943 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -16,9 +16,12 @@ #include #include "nccl1_compat.h" +// 环境变量是方便,但是会多一些判断,可能影响性能。 // #define DEBUG_PRINT 1 // #define NEW_TIMER 1 +#define SHOW_ITER_TIME 1 +#define SHOW_AVG_TIME 1 
#define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) From 266d3c8099340e63ecb9bd2861c1c03102d83aa2 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 12:56:02 +0000 Subject: [PATCH 079/109] update xls name and ndev --- ofccl_test.sh | 6 +++--- test_scripts/auto_test.py | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 3465366..c63c780 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,11 +42,11 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 # export CUDA_VISIBLE_DEVICES=0,1,4,5 export SHOW_ALL_PREPARED_COLL=0 - export NITER=16 - export NBYTES=8K + export NITER=8 + export NBYTES=128K export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 9eb5383..3edf8b2 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -17,12 +17,19 @@ collectOfcclResult = True # 统计ofccl测试结果,写入xls NCCL_ORDER="1" -resultXlsName="result_"+DATE+"_"+NCCL_ORDER+".xls" -n = 2 -m = 3 #nccl +host=os.environ.get("HOST") +n = 8 +m = 1 #nccl w = 2 -M = 3 #ofccl -NUM_DEV = 4#设备的卡数,实验用到的卡数写在循环里 +M = 1 #ofccl +if host=="oneflow-15" or host=="oneflow-16": + NUM_DEV = 4#设备的总卡数,实验用到的卡数写在循环里 + ncards = [2,4] +else: + NUM_DEV = 8 + ncards = [2,4,8] + +resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+m+"_n"+n+"_w"+w+".xls" # static os.system("g++ ./nccl/static_nccl.cpp -o ./nccl/static_nccl.out") @@ -36,7 +43,7 @@ bwSheet = table.add_sheet('bw') tmSheet = table.add_sheet('time') cnt = 0 -for MY_NUM_DEV in [2,4]: +for MY_NUM_DEV in ncards: if 'CUDA_VISIBLE_DEVICES' in os.environ: del os.environ['CUDA_VISIBLE_DEVICES'] From 4702178642bf91ab7641f68ede153dcac18e5c02 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 13:09:55 +0000 Subject: [PATCH 080/109] add log --- ofccl_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 2745011..6bc92ba 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,7 +42,7 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=4 + export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi From 5f7b4bfdcc98c24359dd8e46e769b7fe9060dc67 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 23 Dec 2022 13:30:56 +0000 Subject: [PATCH 081/109] update env --- test_scripts/auto_test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 3edf8b2..a99abab 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -5,12 +5,18 @@ os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" os.environ['NCCL_ALGO'] = "RING" + +os.environ['TRAVERSE_TIMES'] = "10" +os.environ['TOLERANT_UNPROGRESSED_CNT'] = "10000" +os.environ['BASE_CTX_SWITCH_THRESHOLD'] = "80" +os.environ['BOUNS_SWITCH_4_PROCESSED_COLL'] = "0" +os.environ['DEV_TRY_ROUND'] = "10" # test # f = os.popen("./nccl/run.sh") # print(f.readlines()) # 设置超参数 # run -DATE="221222" +DATE="221223" runNcclTest = True # 运行nccl测试 collectNcclResult = True # 统计nccl测试结果,写入xls runOfcclTest = True# 运行ofccl测试 @@ -29,7 +35,7 @@ NUM_DEV = 8 ncards = [2,4,8] 
-resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+m+"_n"+n+"_w"+w+".xls" +resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+str(m)+"n"+str(n)+"w"+str(w)+".xls" # static os.system("g++ ./nccl/static_nccl.cpp -o ./nccl/static_nccl.out") From 5ced1a08f2681f3dd90f6f63f1d70a68cad37bc0 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 24 Dec 2022 04:59:06 +0000 Subject: [PATCH 082/109] +nccl ofccl run.sh --- .gitignore | 2 +- nccl_test.sh | 4 +-- ofccl_test.sh | 4 +-- src_simple/common_simple.h | 4 +-- test_scripts/nccl/run_nccl.sh | 42 +++++++++++++++++++++++++++ test_scripts/ofccl/run_ofccl.sh | 50 +++++++++++++++++++++++++++++++++ 6 files changed, 99 insertions(+), 7 deletions(-) create mode 100755 test_scripts/nccl/run_nccl.sh create mode 100755 test_scripts/ofccl/run_ofccl.sh diff --git a/.gitignore b/.gitignore index 81a260f..0eba5f0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,6 @@ .vscode -test_result*/ +*_result*/ *.xls *.out \ No newline at end of file diff --git a/nccl_test.sh b/nccl_test.sh index 89ba9a8..e96be13 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -16,13 +16,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 export NITER=8 - export NBYTES=8K + export NBYTES=64 export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/ofccl_test.sh b/ofccl_test.sh index 6bc92ba..2dd6284 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,13 +42,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=256 + export NBYTES=128 export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 1e61943..e8ef280 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -20,8 +20,8 @@ // #define DEBUG_PRINT 1 // #define NEW_TIMER 1 -#define SHOW_ITER_TIME 1 -#define SHOW_AVG_TIME 1 +// #define SHOW_ITER_TIME 1 +// #define SHOW_AVG_TIME 1 #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh new file mode 100755 index 0000000..58c26fd --- /dev/null +++ b/test_scripts/nccl/run_nccl.sh @@ -0,0 +1,42 @@ +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +export DATE=221224 +export NCCL_ORDER=1 + +for MY_NUM_DEV in 2 4 8 +do + unset CUDA_VISIBLE_DEVICES + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi + export RES_DIR=run_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards + if [ ! 
-d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 8 + do + for w in 2 + do + for m in 1 + do + for iter in 1 + do + export RES_PATH="./$RES_DIR/nccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -m $m >> $RES_PATH + done + done + done + done + done +done diff --git a/test_scripts/ofccl/run_ofccl.sh b/test_scripts/ofccl/run_ofccl.sh new file mode 100755 index 0000000..1f6c486 --- /dev/null +++ b/test_scripts/ofccl/run_ofccl.sh @@ -0,0 +1,50 @@ +export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib +export NCCL_PROTO=Simple +export NCCL_ALGO=Ring +export NCCL_MAX_NCHANNELS=1 +export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 + +export DATE=221224 +export NCCL_ORDER=1 + +export TRAVERSE_TIMES=10 +export TOLERANT_UNPROGRESSED_CNT=10000 +export BASE_CTX_SWITCH_THRESHOLD=80 +export BOUNS_SWITCH_4_PROCESSED_COLL=0 +export DEV_TRY_ROUND=10 + +# export SHOW_ALL_PREPARED_COLL=1 + +for MY_NUM_DEV in 2 4 8 +do + unset CUDA_VISIBLE_DEVICES + if [ $MY_NUM_DEV = 4 ]; then + export CUDA_VISIBLE_DEVICES=0,1,4,5 + fi + export RES_DIR=run_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards + if [ ! -d "$RES_DIR" ]; then + mkdir $RES_DIR + fi + + for n in 5 + do + for w in 2 + do + for m in 1 + do + for iter in 1 + do + export RES_PATH="./$RES_DIR/ofccl_result_"$iter"_n"$n"_w"$w"_m"$m".txt" + ## Time + echo $(date +%F%n%T)>> $RES_PATH + for a in 64 128 256 512 1K 2K 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G + do + ## Test + /home/panlichen/work2/nccl-tests/build/ofccl_all_reduce_perf -b $a -e $a -f 2 -t $MY_NUM_DEV -g 1 -n $n -w $w -c 0 -M $m >> $RES_PATH + done + done + done + done + done +done From 875d1d5753c1bd1664e4504e2432c91c9415baf5 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 25 Dec 2022 11:16:33 +0000 Subject: [PATCH 083/109] report rank 0 avg time --- nccl_test.sh | 4 ++-- ofccl_test.sh | 4 ++-- src/common.cu | 2 +- src_simple/common_simple.cu | 2 +- src_simple/common_simple.h | 2 +- test_scripts/nccl/run_nccl.sh | 2 +- test_scripts/ofccl/run_ofccl.sh | 8 ++++---- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index e96be13..4ce69c7 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -21,8 +21,8 @@ if [ "$BINARY" == "DEBUG" ];then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 - export NITER=8 - export NBYTES=64 + export NITER=5 + export NBYTES=4K export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/ofccl_test.sh b/ofccl_test.sh index 2dd6284..33982e7 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,13 +42,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=128 + export NBYTES=64M export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/src/common.cu b/src/common.cu index 716362b..f22be54 100644 --- a/src/common.cu +++ b/src/common.cu @@ -780,7 +780,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); 
TESTCHECK(BenchTime(args, type, op, root, 0)); - TESTCHECK(BenchTime(args, type, op, root, 1)); + // TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } return testSuccess; diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index b7ef3ab..52d6be6 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -886,7 +886,7 @@ testResult_t BenchTime(struct threadArgs *args, ncclDataType_t type, ncclRedOp_t int cudaDev; cudaGetDevice(&cudaDev); if (cudaDev == 0) - OFTEST_LOG(TEST, "Rank<%d>, time = %lfus, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); + OFTEST_LOG(TEST, "Rank<%d>, time = %lf us, iters * multi_iters = %d", cudaDev, deltaSec * 1.0E6, iters * multi_iters); // int clockRate; // cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, cudaDev); diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index e8ef280..8801172 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -21,7 +21,7 @@ // #define NEW_TIMER 1 // #define SHOW_ITER_TIME 1 -// #define SHOW_AVG_TIME 1 +#define SHOW_AVG_TIME 1 #define OFTEST_LOG(PRE, FMT, args...) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh index 58c26fd..c8d5ec9 100755 --- a/test_scripts/nccl/run_nccl.sh +++ b/test_scripts/nccl/run_nccl.sh @@ -19,7 +19,7 @@ do mkdir $RES_DIR fi - for n in 8 + for n in 5 do for w in 2 do diff --git a/test_scripts/ofccl/run_ofccl.sh b/test_scripts/ofccl/run_ofccl.sh index 1f6c486..ed99b51 100755 --- a/test_scripts/ofccl/run_ofccl.sh +++ b/test_scripts/ofccl/run_ofccl.sh @@ -1,12 +1,12 @@ export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -export NCCL_MAX_NCHANNELS=1 -export NCCL_MIN_NCHANNELS=1 +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -export DATE=221224 -export NCCL_ORDER=1 +export DATE=221225 +export NCCL_ORDER=2 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 From 386ee920d7de62249adfcc49b08c3331f57b1965 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 25 Dec 2022 11:27:53 +0000 Subject: [PATCH 084/109] scripts --- .gitignore | 2 +- nccl_test.sh | 2 +- test_scripts/nccl/run_nccl.sh | 4 ++-- test_scripts/ofccl/run_ofccl.sh | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 0eba5f0..99f99d6 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,6 @@ .vscode -*_result*/ +*result*/ *.xls *.out \ No newline at end of file diff --git a/nccl_test.sh b/nccl_test.sh index 4ce69c7..b938806 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -22,7 +22,7 @@ if [ "$BINARY" == "DEBUG" ];then fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=4K + export NBYTES=64M export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh index c8d5ec9..7a0de3c 100755 --- a/test_scripts/nccl/run_nccl.sh +++ b/test_scripts/nccl/run_nccl.sh @@ -5,7 +5,7 @@ export NCCL_ALGO=Ring # export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -export DATE=221224 +export DATE=221225 export NCCL_ORDER=1 for MY_NUM_DEV in 2 4 8 @@ -14,7 +14,7 @@ do if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export RES_DIR=run_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards + 
export RES_DIR=result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards if [ ! -d "$RES_DIR" ]; then mkdir $RES_DIR fi diff --git a/test_scripts/ofccl/run_ofccl.sh b/test_scripts/ofccl/run_ofccl.sh index ed99b51..3be6bf7 100755 --- a/test_scripts/ofccl/run_ofccl.sh +++ b/test_scripts/ofccl/run_ofccl.sh @@ -6,7 +6,7 @@ export NCCL_ALGO=Ring # export NCCL_NTHREADS=64 export DATE=221225 -export NCCL_ORDER=2 +export NCCL_ORDER=3 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 @@ -22,7 +22,7 @@ do if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export RES_DIR=run_result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards + export RES_DIR=result_${DATE}_${NCCL_ORDER}_${MY_NUM_DEV}cards if [ ! -d "$RES_DIR" ]; then mkdir $RES_DIR fi From 13567f2f491f9afca09ee4602ddf3f8b60bf6ef9 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 25 Dec 2022 12:29:03 +0000 Subject: [PATCH 085/109] nccl show each kernel time --- nccl_test.sh | 2 +- src/common.cu | 76 +++++++++++++++++++++-------------- src/common.h | 2 + test_scripts/nccl/run_nccl.sh | 2 +- 4 files changed, 50 insertions(+), 32 deletions(-) diff --git a/nccl_test.sh b/nccl_test.sh index b938806..80a203f 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -22,7 +22,7 @@ if [ "$BINARY" == "DEBUG" ];then fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64M + export NBYTES=64 export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/src/common.cu b/src/common.cu index f22be54..fea29f0 100644 --- a/src/common.cu +++ b/src/common.cu @@ -735,11 +735,13 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } else { sprintf(timeStr, "%7.2f", timeUsec); } - if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); - } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); - } + #ifndef NCCL_DEBUG_CLOCK + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + #endif args->bw[0] += busBw; args->bw_count[0]++; @@ -778,7 +780,10 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + + #ifndef NCCL_DEBUG_CLOCK + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + #endif TESTCHECK(BenchTime(args, type, op, root, 0)); // TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); @@ -1030,13 +1035,16 @@ testResult_t run() { #endif is_main_thread = (proc == 0) ? 
1 : 0; - PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, - (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); - if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); - if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); - PRINT("#\n"); + #ifndef NCCL_DEBUG_CLOCK + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); + #endif - PRINT("# Using devices\n"); #define MAX_LINE 2048 char line[MAX_LINE]; int len = 0; @@ -1051,20 +1059,21 @@ testResult_t run() { maxMem = std::min(maxMem, prop.totalGlobalMem); } -#if MPI_SUPPORT - char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL; - // Gather all output in rank order to root (0) - MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD); - if (proc == 0) { - for (int p = 0; p < nProcs; p++) - PRINT("%s", lines+MAX_LINE*p); - free(lines); - } - MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); -#else - PRINT("%s", line); +#ifndef NCCL_DEBUG_CLOCK + #if MPI_SUPPORT + char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL; + // Gather all output in rank order to root (0) + MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD); + if (proc == 0) { + for (int p = 0; p < nProcs; p++) + PRINT("%s", lines+MAX_LINE*p); + free(lines); + } + MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD); + #else + PRINT("%s", line); + #endif #endif - // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest. size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2); if (maxBytes > memMaxBytes) { @@ -1121,8 +1130,10 @@ testResult_t run() { errors[t] = bw_count[t] = 0; } - PRINT("#\n"); - print_header(); + #ifndef NCCL_DEBUG_CLOCK + PRINT("#\n"); + print_header(); + #endif int* sync = (int*)calloc(2, sizeof(int)); int* barrier = (int*)calloc(2, sizeof(int)); @@ -1202,9 +1213,14 @@ testResult_t run() { double check_avg_bw = str ? atof(str) : -1; bw[0] /= bw_count[0]; - PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK"); - PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK")); - PRINT("#\n"); + #ifndef NCCL_DEBUG_CLOCK + PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK"); + PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK")); + PRINT("#\n"); + #else + PRINT("\n"); + PRINT("\n"); + #endif #ifdef MPI_SUPPORT MPI_Finalize(); #endif diff --git a/src/common.h b/src/common.h index 745bd76..a6703b2 100644 --- a/src/common.h +++ b/src/common.h @@ -18,6 +18,8 @@ #define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) +// #define NCCL_DEBUG_CLOCK 1 + #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ if( err != cudaSuccess ) { \ diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh index 7a0de3c..d69bf7d 100755 --- a/test_scripts/nccl/run_nccl.sh +++ b/test_scripts/nccl/run_nccl.sh @@ -6,7 +6,7 @@ export NCCL_ALGO=Ring # export NCCL_NTHREADS=64 export DATE=221225 -export NCCL_ORDER=1 +export NCCL_ORDER=4 for MY_NUM_DEV in 2 4 8 do From 12969b5b09eb9417b5b6a1905b95446913775193 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Tue, 27 Dec 2022 09:18:53 +0000 Subject: [PATCH 086/109] =?UTF-8?q?=E8=83=BD=E5=A4=84=E7=90=86=E5=9D=87?= =?UTF-8?q?=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 161 +++++++++++++-------- test_scripts/ofccl/static_ofccl_QE.cpp | 174 +++++++++++++++++++++++ test_scripts/ofccl/static_ofccl_bw.cpp | 43 ++++++ test_scripts/ofccl/static_ofccl_time.cpp | 40 ++++++ 4 files changed, 360 insertions(+), 58 deletions(-) create mode 100644 test_scripts/ofccl/static_ofccl_QE.cpp create mode 100644 test_scripts/ofccl/static_ofccl_bw.cpp create mode 100644 test_scripts/ofccl/static_ofccl_time.cpp diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index a99abab..0c4d9c7 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -1,6 +1,11 @@ import os import xlrd import xlwt +# 设置字体大小 +style = xlwt.XFStyle() +font = xlwt.Font() +font.height = 20*16 +style.font = font # 设置环境变量 os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" @@ -11,20 +16,23 @@ os.environ['BASE_CTX_SWITCH_THRESHOLD'] = "80" os.environ['BOUNS_SWITCH_4_PROCESSED_COLL'] = "0" os.environ['DEV_TRY_ROUND'] = "10" -# test -# f = os.popen("./nccl/run.sh") -# print(f.readlines()) + # 设置超参数 -# run -DATE="221223" -runNcclTest = True # 运行nccl测试 -collectNcclResult = True # 统计nccl测试结果,写入xls -runOfcclTest = True# 运行ofccl测试 -collectOfcclResult = True # 统计ofccl测试结果,写入xls +DATE="221226" +runNcclTest = False # 运行nccl测试,仅输出原始结果 +staticNccl = False +collectNcclResult = True # 统计nccl测试结果,写入xls + + +runOfcclTest = False# 运行ofccl测试 +staticOfccl = True +staticOfcclExtral = True # 对ofccl的额外输出进行统计 +collectOfcclResult = True# 统计ofccl测试结果,写入xls + NCCL_ORDER="1" host=os.environ.get("HOST") -n = 8 +n = 5 m = 1 #nccl w = 2 M = 1 #ofccl @@ -40,14 +48,19 @@ # static os.system("g++ ./nccl/static_nccl.cpp -o ./nccl/static_nccl.out") os.system("g++ ./nccl/static_time.cpp -o ./nccl/static_time.out") -os.system("g++ ./ofccl/clear_static_ofccl_time.cpp -o ./ofccl/clear_static_ofccl_time.out") -os.system("g++ ./ofccl/clear_static_ofccl.cpp -o ./ofccl/clear_static_ofccl.out") - +os.system("g++ ./ofccl/static_ofccl_time.cpp -o ./ofccl/static_ofccl_time.out") +os.system("g++ ./ofccl/static_ofccl_bw.cpp -o ./ofccl/static_ofccl_bw.out") +os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") table = xlwt.Workbook() bwSheet = table.add_sheet('bw') tmSheet = table.add_sheet('time') +# 列宽 +for i in range(30): + bwSheet.col(i).width = 13 * 256 + tmSheet.col(i).width = 16 * 256 + cnt = 0 for MY_NUM_DEV in ncards: @@ -65,24 +78,24 @@ NCCL_OUTPUT_TIME_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards_time.txt" - if runNcclTest == True: + if staticNccl == True: os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_BW_PATH) os.system("echo $(date 
+%F%n%T)>>"+NCCL_OUTPUT_TIME_PATH) - for iter in [1,2,3]: - NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - + for iter in [1,2,3]: + NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + if runNcclTest: os.system("echo $(date +%F%n%T)>> "+NCCL_RES_PATH) for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RES_PATH) - + if staticNccl: os.system("./nccl/static_nccl.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) os.system("./nccl/static_time.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) if collectNcclResult == True : # bus - bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡') + bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) with open(NCCL_OUTPUT_BW_PATH) as f: content = f.read() @@ -90,39 +103,39 @@ axis_y = ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] for a in range(0,25): - bwSheet.write(2+a+cnt*30,0,axis_y[a]) + bwSheet.write(2+a+cnt*30,0,axis_y[a],style) # for k in [0,1,2]: - bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k)) + bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2]) + bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2],style) - bwSheet.write(1+cnt*30,1+15+k,'nccl-busbw'+str(k)) + bwSheet.write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,1+15+k,bw[i+k*50+25+2]) + bwSheet.write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) # avg - bwSheet.write(1+cnt*30, 4, 'avg-algbw') - bwSheet.write(1+cnt*30, 19, 'avg-busbw') + bwSheet.write(1+cnt*30, 4, 'avg-algbw',style) + bwSheet.write(1+cnt*30, 15, 'avg-busbw',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ) - bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3')) + bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) + bwSheet.write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) # time with open(NCCL_OUTPUT_TIME_PATH) as f2: content2 = f2.read() times = content2.split() - tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡') + tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) for a in range(0,25): - tmSheet.write(2+a+cnt*30,0,axis_y[a]) + tmSheet.write(2+a+cnt*30,0,axis_y[a],style) for k in [0,1,2]: - tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k)) + tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k),style) for i in range(0,25): - tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2]) + tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2],style) # avg - tmSheet.write(1+cnt*30, 4, 'avg-nccl') + tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ) + tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ,style) #OFCCL @@ -133,20 +146,26 
@@ # 统计结果 OFCCL_OUTPUT_BW_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards.txt" OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" + OFCCL_OUTPUT_QE_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE.txt" - if runOfcclTest == True: + if staticOfccl == True: os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_TIME_PATH) + if staticOfcclExtral: + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_PATH) - for iter in [1,2,3]: - OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" - + for iter in [1,2,3]: + OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" + if runOfcclTest: os.system("echo $(date +%F%n%T)>> "+OFCCL_RES_PATH) for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: os.system("../build/ofccl_all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ OFCCL_RES_PATH) + if staticOfccl: + os.system("./ofccl/static_ofccl_bw.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH) + os.system("./ofccl/static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH) + if staticOfcclExtral: + os.system("./ofccl/static_ofccl_QE.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_PATH) - os.system("./ofccl/clear_static_ofccl.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./ofccl/clear_static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) if collectOfcclResult == True: @@ -155,19 +174,19 @@ bw = content2.split() #bus for k in [0,1,2]: - bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k)) + bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2]) + bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2],style) - bwSheet.write(1+cnt*30,5+15+k,'ofccl-busbw'+str(k)) + bwSheet.write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,5+15+k,bw[i+k*50+25+2]) + bwSheet.write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) # avg - bwSheet.write(1+cnt*30, 4+4, 'avg-algbw') - bwSheet.write(1+cnt*30, 19+4, 'avg-busbw') + bwSheet.write(1+cnt*30,8, 'avg-algbw',style) + bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ) - bwSheet.write(2+i+cnt*30, 19+4, xlwt.Formula('SUM(U'+str(2+i+cnt*30+1)+',V'+str(2+i+cnt*30+1)+',W'+str(2+i+cnt*30+1)+')/3')) + bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ,style) + bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) # time with open(OFCCL_OUTPUT_TIME_PATH) as f2: @@ -175,22 +194,48 @@ times = content2.split() for k in [0,1,2]: - tmSheet.write(1+cnt*30,5+k,'OFccl-'+str(k)) + tmSheet.write(1+cnt*30,5+k,'ofccl-'+str(k),style) for i in range(0,25): - tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2]) + tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2],style) # avg - tmSheet.write(1+cnt*30, 4+4, 'avg-OFCCL') + tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4+4, 
xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ) + tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ,style) if collectNcclResult and collectOfcclResult: - bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl') - bwSheet.write(1+cnt*30, 24, '(ofccl-nccl)/nccl') - tmSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl') + bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) + bwSheet.write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) + tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) + tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) + for i in range(0,25): + bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)) ,style) + bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) + tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) + tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) + + # time 各个列的标题 + if staticOfcclExtral: + tmSheet.write(1+cnt*30, 13,'nccl IO',style ) + tmSheet.write(1+cnt*30, 14,'nccl kern',style ) + tmSheet.write(1+cnt*30, 15,'ofccl-nccl kern',style ) + tmSheet.write(1+cnt*30, 16,'before after get sqe',style ) + tmSheet.write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) + tmSheet.write(1+cnt*30, 18,'before after put cqe',style ) + tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) + tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) + tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) + + with open(OFCCL_OUTPUT_QE_PATH) as f3: + content3 = f3.read() + times = content3.split() for i in range(0,25): - bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)) ) - bwSheet.write(2+i+cnt*30, 24, xlwt.Formula('(X'+str(2+i+cnt*30+1)+'-T'+str(2+i+cnt*30+1)+')/T'+str(2+i+cnt*30+1) )) - tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ) ) + tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) + tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) + tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) + tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) + tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) + + cnt = cnt+1 diff --git a/test_scripts/ofccl/static_ofccl_QE.cpp b/test_scripts/ofccl/static_ofccl_QE.cpp new file mode 100644 index 0000000..3705bdb --- /dev/null +++ b/test_scripts/ofccl/static_ofccl_QE.cpp @@ -0,0 +1,174 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector time; + vector sqe; + vector beforeCqe; + vector putCqe; + vector afterCqe; + string bw="bandwidth"; + + int cnt = 0; + double sqe_sum = 0; + int sqe_cnt = 0; + + double beforeCqe_sum=0; + int beforeCqe_cnt = 0; + + double putCqe_sum = 0; + int putCqe_cnt = 0; + + double afterCqe_sum = 0; + int afterCqe_cnt = 0; + + while(getline(cin, inputLine)){ + if(inputLine.find(bw,0) != -1){ + // 判断结束一个输出 + // before after get sqe + double sqe_avg = sqe_sum / sqe_cnt; + sqe.push_back(sqe_avg); + sqe_sum = 0; + sqe_cnt =0; + // AfterSqe TO BeforeCqe + double beforeCqe_avg = beforeCqe_sum / beforeCqe_cnt; + beforeCqe.push_back(beforeCqe_avg); + beforeCqe_sum =0; + beforeCqe_cnt =0; + //before after 
put cqe + double putCqe_avg = putCqe_sum / putCqe_cnt; + putCqe.push_back(putCqe_avg); + putCqe_sum = 0; + putCqe_cnt = 0; + //beforeSqe TO afterCqe + double afterCqe_avg = afterCqe_sum/afterCqe_cnt; + afterCqe.push_back(afterCqe_avg); + afterCqe_sum=0; + afterCqe_cnt=0; + + if(++cnt == 25) + break; + } + // rank0 time + int pos = -1; + if ((pos=inputLine.find("time = ",0) ) != -1){ + pos += 7; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + time.push_back(t); + continue; + } + + // before after get sqe + if ((pos=inputLine.find("before after get sqe AVG",0) ) != -1){ + pos += 27; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + stringstream ss; + double tt; + ss << t; + ss >> tt; + pos=inputLine.find("weight = ",0); + pos +=9; + int count = inputLine[pos] - '0'; + sqe_sum += tt * count; + sqe_cnt += count; + continue; + } + //AfterSqe TO BeforeCqe + if ((pos=inputLine.find("AfterSqe TO BeforeCqe AVG",0) ) != -1){ + pos += 28; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + stringstream ss; + double tt; + ss << t; + ss >> tt; + pos=inputLine.find("weight = ",0); + pos +=9; + int count = inputLine[pos] - '0'; + beforeCqe_sum += tt * count; + beforeCqe_cnt += count; + continue; + } + + //before after put cqe + if ((pos=inputLine.find("before after put cqe AVG ",0) ) != -1){ + pos += 27; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + stringstream ss; + double tt; + ss << t; + ss >> tt; + pos=inputLine.find("weight = ",0); + pos +=9; + int count = inputLine[pos] - '0'; + putCqe_sum += tt * count; + putCqe_cnt += count; + continue; + } + //beforeSqe TO afterCqe + if ((pos=inputLine.find("beforeSqe TO afterCqe AVG = ",0) ) != -1){ + pos += 28; + string t=""; + while(inputLine[pos] != ' '){ + t += inputLine[pos]; + pos++; + } + stringstream ss; + double tt; + ss << t; + ss >> tt; + pos=inputLine.find("weight = ",0); + pos +=9; + int count = inputLine[pos] - '0'; + afterCqe_sum += tt * count; + afterCqe_cnt += count; + continue; + } + + + } + + // before after get sqe + for (auto s:sqe){ + cout << s << endl; + } + cout < +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + b.push_back(ss.top()); + ss.pop(); + a.push_back(ss.top()); + + if(++cnt == 25) + break; + } + + for(auto a1:a) + cout< +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector a; + vector b; + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; + + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + ss.pop(); + ss.pop(); + a.push_back(ss.top()); + + if(++cnt == 25) + break; + } + + for(auto a1:a) + cout< Date: Tue, 27 Dec 2022 10:47:31 +0000 Subject: [PATCH 087/109] =?UTF-8?q?time=E9=A1=B5=E8=A1=A8=20R=E5=88=97-O?= =?UTF-8?q?=E5=88=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 7 
++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 0c4d9c7..4cb745c 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -18,15 +18,15 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -DATE="221226" +DATE="221227" runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = False +staticNccl = True collectNcclResult = True # 统计nccl测试结果,写入xls runOfcclTest = False# 运行ofccl测试 staticOfccl = True -staticOfcclExtral = True # 对ofccl的额外输出进行统计 +staticOfcclExtral = False # 对ofccl的额外输出进行统计 collectOfcclResult = True# 统计ofccl测试结果,写入xls @@ -229,6 +229,7 @@ content3 = f3.read() times = content3.split() for i in range(0,25): + tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'- O'+str(3+i+cnt*30) ),style ) tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) From 2b4b937c9b674b9a2acd9a0afad0d3176eea3487 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Tue, 27 Dec 2022 11:04:57 +0000 Subject: [PATCH 088/109] =?UTF-8?q?nccl=20kern=20=E6=B1=82=E5=B9=B3?= =?UTF-8?q?=E5=9D=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 4cb745c..57bab69 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -18,16 +18,16 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -DATE="221227" +DATE="221226" runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = True -collectNcclResult = True # 统计nccl测试结果,写入xls +staticNccl = False # 运行统计,输出中间结果 +collectNcclResult = True # 收集nccl测试结果,写入xls runOfcclTest = False# 运行ofccl测试 -staticOfccl = True -staticOfcclExtral = False # 对ofccl的额外输出进行统计 -collectOfcclResult = True# 统计ofccl测试结果,写入xls +staticOfccl = False # 运行统计,输出中间结果 +staticOfcclExtral = True # 对ofccl的额外输出进行统计 +collectOfcclResult = True# 收集ofccl测试结果,写入xls NCCL_ORDER="1" @@ -229,7 +229,8 @@ content3 = f3.read() times = content3.split() for i in range(0,25): - tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'- O'+str(3+i+cnt*30) ),style ) + tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('( V'+str(3+i+cnt*30)+'+W'+str(3+i+cnt*30)+'+X'+str(3+i+cnt*30)+'+Y'+str(3+i+cnt*30)+'+Z'+str(3+i+cnt*30)+' )/5' ),style ) + tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) From 93b9ddcb52e753464643f236a82d8d21aa6a3c91 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Wed, 28 Dec 2022 04:46:34 +0000 Subject: [PATCH 089/109] =?UTF-8?q?=E8=BE=93=E5=87=BA=20ori?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 22 +++- test_scripts/ofccl/static_ofccl_QE_ori.cpp | 120 +++++++++++++++++++++ 2 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 test_scripts/ofccl/static_ofccl_QE_ori.cpp diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 57bab69..964d5ea 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -7,6 +7,7 @@ font.height = 20*16 style.font = font # 设置环境变量 +#os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" os.environ['LD_LIBRARY_PATH'] = 
"/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" os.environ['NCCL_ALGO'] = "RING" @@ -18,7 +19,6 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -DATE="221226" runNcclTest = False # 运行nccl测试,仅输出原始结果 staticNccl = False # 运行统计,输出中间结果 collectNcclResult = True # 收集nccl测试结果,写入xls @@ -29,7 +29,7 @@ staticOfcclExtral = True # 对ofccl的额外输出进行统计 collectOfcclResult = True# 收集ofccl测试结果,写入xls - +DATE="221226" NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 @@ -147,12 +147,14 @@ OFCCL_OUTPUT_BW_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards.txt" OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" OFCCL_OUTPUT_QE_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE.txt" + OFCCL_OUTPUT_QE_ORI_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" if staticOfccl == True: os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_TIME_PATH) if staticOfcclExtral: - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_PATH) + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_PATH) + os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_ORI_PATH) for iter in [1,2,3]: OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" @@ -165,6 +167,7 @@ os.system("./ofccl/static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH) if staticOfcclExtral: os.system("./ofccl/static_ofccl_QE.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_PATH) + os.system("./ofccl/static_ofccl_QE_ori.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_ORI_PATH) if collectOfcclResult == True: @@ -224,10 +227,17 @@ tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) + tmSheet.write(1+cnt*30, 27,'before after get sqe ori',style ) + tmSheet.write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) + tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) + tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) with open(OFCCL_OUTPUT_QE_PATH) as f3: content3 = f3.read() times = content3.split() + with open(OFCCL_OUTPUT_QE_ORI_PATH) as f4: + content4 = f4.read() + times4 = content4.split() for i in range(0,25): tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('( V'+str(3+i+cnt*30)+'+W'+str(3+i+cnt*30)+'+X'+str(3+i+cnt*30)+'+Y'+str(3+i+cnt*30)+'+Z'+str(3+i+cnt*30)+' )/5' ),style ) tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) @@ -236,6 +246,12 @@ tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) + for j in range(0,5): + tmSheet.write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) + tmSheet.write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) + tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) + tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) + diff --git a/test_scripts/ofccl/static_ofccl_QE_ori.cpp b/test_scripts/ofccl/static_ofccl_QE_ori.cpp new file mode 100644 index 0000000..08794b5 --- /dev/null +++ b/test_scripts/ofccl/static_ofccl_QE_ori.cpp @@ -0,0 +1,120 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + + vector sqe_ori; + vector beforeCqe_ori; + vector putCqe_ori; + vector 
afterCqe_ori; + string bw="bandwidth"; + + + int cnt=0; + while(getline(cin, inputLine)){ + if(inputLine.find(bw,0) != -1){ + // 判断结束一个输出 + // before after get sqe + + if(++cnt == 25) + break; + } + // rank0 time + int pos = -1; + // before after get sqe + if ((pos=inputLine.find("Rank<0> Blk<0> Thrd<0> coll_id = 0, before after get sqe = ",0) ) != -1){ + pos += 58; + string numbers = inputLine.substr(pos); + stringstream ss ; + ss << numbers; + for(int i = 0;i < 5;i++){ + double tmp; + ss >> tmp; + sqe_ori.push_back(tmp); + } + continue; + } + //AfterSqe TO BeforeCqe + if ((pos=inputLine.find("AfterSqe TO BeforeCqe = ",0) ) != -1){ + pos += 24; + string numbers = inputLine.substr(pos); + stringstream ss ; + ss << numbers; + for(int i = 0;i < 5;i++){ + double tmp; + ss >> tmp; + if(tmp > 0.00001) + beforeCqe_ori.push_back(tmp); + } + continue; + } + + //before after put cqe + if ((pos=inputLine.find("before after put cqe = ",0) ) != -1){ + pos += 23; + string numbers = inputLine.substr(pos); + stringstream ss ; + ss << numbers; + for(int i = 0;i < 5;i++){ + double tmp; + ss >> tmp; + if(tmp > 0.00001) + putCqe_ori.push_back(tmp); + } + continue; + } + + //beforeSqe TO afterCqe + if ((pos=inputLine.find("beforeSqe TO afterCqe = ",0) ) != -1){ + pos += 24; + string numbers = inputLine.substr(pos); + stringstream ss ; + ss << numbers; + for(int i = 0;i < 5;i++){ + double tmp; + ss >> tmp; + if(tmp > 0.00001) + afterCqe_ori.push_back(tmp); + } + continue; + } + } + + // before after get sqe + for(int i = 0;i <25;i++){ + for(int j =0;j < 5;j++) + cout< Date: Wed, 28 Dec 2022 10:06:41 +0000 Subject: [PATCH 090/109] =?UTF-8?q?=E7=BC=96=E8=AF=91=20QE=5Foricpp=20?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=80=E5=88=97=E5=AE=9E=E9=99=85=E7=9A=84?= =?UTF-8?q?byte=E6=95=B0=EF=BC=8C=E5=A2=9E=E5=8A=A0=20Ex-Ox=EF=BC=8C?= =?UTF-8?q?=E4=BF=AE=E6=94=B9average?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 964d5ea..4f8a1e9 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -51,6 +51,7 @@ os.system("g++ ./ofccl/static_ofccl_time.cpp -o ./ofccl/static_ofccl_time.out") os.system("g++ ./ofccl/static_ofccl_bw.cpp -o ./ofccl/static_ofccl_bw.out") os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") +os.system("g++ ./ofccl/static_ofccl_QE_ori.cpp -o ./ofccl/static_ofccl_QE_ori.out") table = xlwt.Workbook() @@ -232,6 +233,11 @@ tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) + y = 64 + for i in range(0,25): + tmSheet.write(2+i+cnt*30,12,y,style) + y = y*2 + with open(OFCCL_OUTPUT_QE_PATH) as f3: content3 = f3.read() times = content3.split() @@ -239,7 +245,8 @@ content4 = f4.read() times4 = content4.split() for i in range(0,25): - tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('( V'+str(3+i+cnt*30)+'+W'+str(3+i+cnt*30)+'+X'+str(3+i+cnt*30)+'+Y'+str(3+i+cnt*30)+'+Z'+str(3+i+cnt*30)+' )/5' ),style ) + tmSheet.write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) 
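Note on the spreadsheet formulas used throughout auto_test.py in the patches above: the script writes the raw per-run measurements into fixed columns and then derives the averages and the ofccl-nccl deltas as Excel formulas that reference those cells by column letter and 1-based row number, which is why the formula strings use str(2+i+cnt*30+1) while the write() call uses the 0-based row 2+i+cnt*30. The following is only a minimal sketch of that xlwt pattern under assumed placeholder values; the file name demo.xls and the sample numbers are illustrative and not part of any patch.

import xlwt

# Same style setup as auto_test.py: font.height is in twips (1/20 pt), so 20*16 = 16 pt.
style = xlwt.XFStyle()
font = xlwt.Font()
font.height = 20 * 16
style.font = font

book = xlwt.Workbook()
sheet = book.add_sheet('time')

# Raw per-run values go into columns B, C, D of spreadsheet row 2 (0-based row index 1).
for col, value in enumerate([10.5, 11.0, 10.8], start=1):
    sheet.write(1, col, value, style)

# Derived cells are Excel formulas. xlwt.Formula takes the text without the leading '=',
# and cell references inside it are 1-based, so 0-based row index 1 appears as "2".
sheet.write(1, 4, xlwt.Formula('SUM(B2,C2,D2)/3'), style)   # three-run average, as in the bw/time sheets
sheet.write(1, 5, xlwt.Formula('AVERAGEA(B2:D2)'), style)   # range form like the one patch 090 switches to

book.save('demo.xls')

The per-card blocks in the real script follow the same idea, only shifted by cnt*30 rows per block, so every formula string is built from that offset plus one.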
From d3f652da48c3240b1c282c91ea333e722e34828e Mon Sep 17 00:00:00 2001 From: Panlichen Date: Thu, 29 Dec 2022 08:07:52 +0000 Subject: [PATCH 091/109] + in order ms --- ofccl_test.sh | 6 ++--- src_manual_size/common_ms.cu | 49 +++++++++++++++++++++++++---------- src_manual_size/common_ms.h | 2 ++ test_scripts/auto_test.py | 15 ++++++----- test_scripts/nccl/run_nccl.sh | 4 +-- 5 files changed, 50 insertions(+), 26 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 33982e7..68b8e8d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,13 +42,13 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64M + export NBYTES=64 export WARMITER=2 export MITER=1 export CHECK=0 @@ -70,7 +70,7 @@ elif [ "$BINARY" == "MS" ];then if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export NITER=200 + export NITER=4 export SHOW_ALL_PREPARED_COLL=1 export WARMITER=0 export NBYTES=8K diff --git a/src_manual_size/common_ms.cu b/src_manual_size/common_ms.cu index 0ed1041..2b8146c 100644 --- a/src_manual_size/common_ms.cu +++ b/src_manual_size/common_ms.cu @@ -17,20 +17,41 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() #ifdef FULL_MS size_t countList[MULTI_ITERS] = {256, 147456, 256, 1024, 65536, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512, 262144, 1024, 2048, 2048, 262144, 2048, 512, 512, 262144, 2048, 1024, 262144, 256, 512, 512, 262144, 2048, 2048, 256, 512, 589824, 512, 262144, 2048, 524288, 512, 1024, 2359296, 2097152, 256, 256, 1024, 256, 1048576, 4096, 2048, 2048, 9437184, 8388608, 1048576, 4194304, 16384, 147456, 1048576, 4000, 1024, 512, 1024, 131072, 8192, 1024, 512, 4096, 1024, 9437184, 65536, 256, 2048, 8192, 4096, 1024, 8192, 2048, 2048, 2048, 1048576, 512, 4194304, 512, 8192, 1024, 2359296, 256, 8192, 1024, 4096, 1024, 1024, 589824, 4096, 4194304, 8192, 8192000, 512, 2048, 2048, 2048, 2048, 2048, 4096, 1048576, 1024, 2048, 256, 2359296, 589824, 1024, 1048576, 8192, 65536, 4096, 2048, 4096, 4096, 37632, 4194304, 1024, 8192, 9437184, 2048, 262144, 1048576, 256, 4194304, 1024, 1024, 1024, 1024, 1048576, 1024, 4096, 1048576, 1024, 1024, 4096, 2359296, 1024, 65536, 2097152, 4096, 1024, 1024, 512, 2359296, 1024, 4096, 65536, 2048, 2359296, 1048576, 1024, 1048576, 256, 1024, 4096}; - int idxList[8][MULTI_ITERS] = { - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 
157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - }, - {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 
40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, - {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 - } - }; + #ifndef IN_ORDER + int idxList[8][MULTI_ITERS] = { + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 
42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + }, + {104, 60, 103, 77, 90, 120, 73, 124, 125, 80, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 51, 52, 144, 140, 93, 109, 96, 122, 113, 66, 159, 55, 108, 97, 127, 130, 132, 87, 115, 61, 134, 136, 75, 137, 139, 138, 141, 135, 142, 116, 68, 145, 59, 86, 147, 149, 150, 131, 81, 151, 121, 155, 98, 156, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 69, 46, 43, 146, 42, 40, 79, 39, 38, 37, 118, 36, 35, 67, 126, 32, 33, 31, 30, 148, 114, 41, 29, 27, 25, 105, 24, 82, 23, 92, 22, 84, 20, 19, 21, 153, 18, 16, 15, 13, 14, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 6, 7, 71, 128, 28, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 2, 112, 1, 158, 48, 57, 94, 0, 88}, + {60, 104, 103, 77, 90, 73, 120, 124, 80, 125, 129, 99, 117, 89, 106, 111, 70, 107, 102, 83, 65, 123, 85, 95, 56, 119, 78, 54, 53, 52, 51, 144, 140, 93, 109, 96, 122, 113, 159, 66, 55, 108, 97, 127, 132, 130, 87, 61, 115, 134, 75, 136, 137, 138, 139, 141, 142, 135, 116, 145, 68, 59, 147, 86, 149, 150, 131, 81, 151, 121, 155, 156, 98, 154, 110, 63, 157, 160, 50, 74, 72, 49, 47, 46, 69, 43, 146, 42, 40, 79, 39, 38, 118, 37, 36, 35, 67, 126, 33, 32, 31, 148, 30, 114, 41, 29, 27, 105, 25, 24, 82, 23, 92, 22, 84, 20, 19, 21, 18, 153, 16, 15, 14, 13, 12, 62, 11, 64, 133, 76, 152, 10, 34, 58, 101, 9, 8, 7, 6, 71, 28, 128, 5, 44, 45, 4, 3, 91, 17, 26, 143, 100, 112, 2, 1, 158, 48, 57, 94, 0, 88 + } + }; + #else + int idxList[8][MULTI_ITERS] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 
102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 
111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + }, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160 + } + }; + #endif #else // size_t countList[MULTI_ITERS] = {256, 147456, 65536, 256, 1024, 147456, 1024, 1024, 65536, 256, 256, 512, 589824, 524288, 512, 512}; // size_t idxList[8][MULTI_ITERS] = { diff --git a/src_manual_size/common_ms.h b/src_manual_size/common_ms.h index 1785efe..14f0ffb 100644 --- a/src_manual_size/common_ms.h +++ b/src_manual_size/common_ms.h @@ -28,6 +28,8 @@ #define MULTI_ITERS 2 #endif +// #define IN_ORDER 1 + #define OFTEST_LOG(PRE, FMT, args...) 
printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__, args) #define OFTEST_LOG1(PRE, FMT) printf("(testlog) [%s:%d] <%s> " #PRE " " FMT "\n", __FILE__, __LINE__, __func__) #define OFTEST_LOG0(PRE) printf("(testlog) [%s:%d] <%s> " #PRE "\n", __FILE__, __LINE__, __func__) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 964d5ea..9a6b99f 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -7,8 +7,8 @@ font.height = 20*16 style.font = font # 设置环境变量 -#os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" -os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" +os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" +# os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" os.environ['NCCL_ALGO'] = "RING" @@ -19,17 +19,17 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = False # 运行统计,输出中间结果 +runNcclTest = True # 运行nccl测试,仅输出原始结果 +staticNccl = True # 运行统计,输出中间结果 collectNcclResult = True # 收集nccl测试结果,写入xls -runOfcclTest = False# 运行ofccl测试 -staticOfccl = False # 运行统计,输出中间结果 +runOfcclTest = True# 运行ofccl测试 +staticOfccl = True # 运行统计,输出中间结果 staticOfcclExtral = True # 对ofccl的额外输出进行统计 collectOfcclResult = True# 收集ofccl测试结果,写入xls -DATE="221226" +DATE="221229" NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 @@ -51,6 +51,7 @@ os.system("g++ ./ofccl/static_ofccl_time.cpp -o ./ofccl/static_ofccl_time.out") os.system("g++ ./ofccl/static_ofccl_bw.cpp -o ./ofccl/static_ofccl_bw.out") os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") +os.system("g++ ./ofccl/static_ofccl_QE_ori.cpp -o ./ofccl/static_ofccl_QE_ori.out") table = xlwt.Workbook() diff --git a/test_scripts/nccl/run_nccl.sh b/test_scripts/nccl/run_nccl.sh index d69bf7d..890e045 100755 --- a/test_scripts/nccl/run_nccl.sh +++ b/test_scripts/nccl/run_nccl.sh @@ -5,8 +5,8 @@ export NCCL_ALGO=Ring # export NCCL_MIN_NCHANNELS=1 # export NCCL_NTHREADS=64 -export DATE=221225 -export NCCL_ORDER=4 +export DATE=221228 +export NCCL_ORDER=1 for MY_NUM_DEV in 2 4 8 do From 5135aa3cbaeb04eccf3b75b6e5b33a6bf3c8bc61 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Thu, 29 Dec 2022 08:15:28 +0000 Subject: [PATCH 092/109] =?UTF-8?q?=E8=BE=93=E5=87=BA=20totalCnt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 55 +++++++- test_scripts/ofccl/static_ofccl_totalCnt.cpp | 124 +++++++++++++++++++ 2 files changed, 176 insertions(+), 3 deletions(-) create mode 100644 test_scripts/ofccl/static_ofccl_totalCnt.cpp diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 4f8a1e9..22bd802 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -20,12 +20,12 @@ # 设置超参数 runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = False # 运行统计,输出中间结果 -collectNcclResult = True # 收集nccl测试结果,写入xls +staticNccl = True # 运行统计,输出中间结果 +collectNcclResult =True # 收集nccl测试结果,写入xls runOfcclTest = False# 运行ofccl测试 -staticOfccl = False # 运行统计,输出中间结果 +staticOfccl = True # 运行统计,输出中间结果 staticOfcclExtral = True # 对ofccl的额外输出进行统计 collectOfcclResult = True# 收集ofccl测试结果,写入xls @@ -52,11 +52,13 @@ os.system("g++ ./ofccl/static_ofccl_bw.cpp -o ./ofccl/static_ofccl_bw.out") os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") os.system("g++ ./ofccl/static_ofccl_QE_ori.cpp -o ./ofccl/static_ofccl_QE_ori.out") +os.system("g++ 
./ofccl/static_ofccl_totalCnt.cpp -o ./ofccl/static_ofccl_totalCnt.out") table = xlwt.Workbook() bwSheet = table.add_sheet('bw') tmSheet = table.add_sheet('time') +cntSheet = table.add_sheet('totalCnt') # 列宽 for i in range(30): bwSheet.col(i).width = 13 * 256 @@ -149,6 +151,7 @@ OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" OFCCL_OUTPUT_QE_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE.txt" OFCCL_OUTPUT_QE_ORI_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + OFCCL_OUTPUT_TOTALCNT_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" if staticOfccl == True: os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) @@ -169,6 +172,7 @@ if staticOfcclExtral: os.system("./ofccl/static_ofccl_QE.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_PATH) os.system("./ofccl/static_ofccl_QE_ori.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_ORI_PATH) + os.system("./ofccl/static_ofccl_totalCnt.out "+OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TOTALCNT_PATH) if collectOfcclResult == True: @@ -259,6 +263,51 @@ tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) + # cntsheet + cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + axis_y = ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + for a in range(0,25): + cntSheet.write(2+a+cnt*30,0,axis_y[a],style) + + cntSheet.write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) + cntSheet.write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) + cntSheet.write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) + cntSheet.write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) + cntSheet.write(1+cnt*30,6,"totalCtxSaveCnt",style) + cntSheet.write(1+cnt*30,24,"totalCtxLoadCnt",style) + cntSheet.write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) + cntSheet.write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) + + with open(OFCCL_OUTPUT_TOTALCNT_PATH) as f: + line = f.readline() + # save + for i in range(0,25): + numbers = line.split() + cntSheet.write(i+2+cnt*30,1,numbers[0]) + for j in range(1,len(numbers)): + cntSheet.write(i+2+cnt*30,5+j,numbers[j]) + line = f.readline() + # load + for i in range(0,25): + numbers = line.split() + cntSheet.write(i+2+cnt*30,2,numbers[0]) + for j in range(1,len(numbers)): + cntSheet.write(i+2+cnt*30,23+j,numbers[j]) + line = f.readline() + # totalProgressed7SwithchCnt + for i in range(0,25): + numbers = line.split() + cntSheet.write(i+2+cnt*30,3,numbers[0]) + for j in range(1,len(numbers)): + cntSheet.write(i+2+cnt*30,41+j,numbers[j]) + line = f.readline() + # totalUnprogressedQuitCnt + for i in range(0,25): + numbers = line.split() + cntSheet.write(i+2+cnt*30,4,numbers[0]) + for j in range(1,len(numbers)): + cntSheet.write(i+2+cnt*30,59+j,numbers[j]) + line = f.readline() diff --git a/test_scripts/ofccl/static_ofccl_totalCnt.cpp b/test_scripts/ofccl/static_ofccl_totalCnt.cpp new file mode 100644 index 0000000..c1f78ee --- /dev/null +++ b/test_scripts/ofccl/static_ofccl_totalCnt.cpp @@ -0,0 +1,124 @@ +#include"bits/stdc++.h" +#include +using namespace std; +int main(int argc,char* argv[]){ + + + freopen(argv[1],"r",stdin); + freopen(argv[2],"a",stdout); + + string inputLine; + vector> save_ori(25,vector()); + vector> load_ori(25,vector()); + vector> p7s_ori(25,vector()); + vector> quit_ori(25,vector()); + + vector save_avg; + vector load_avg; + 
vector p7s_avg; + vector quit_avg; + + string bw="bandwidth"; + + int cnt=0; + while(getline(cin, inputLine)){ + if(inputLine.find(bw,0) != -1){ + // 判断结束一个输出 + // save + double sum = accumulate(begin(save_ori[cnt]), end(save_ori[cnt]), 0); + double mean = sum / save_ori[cnt].size(); + save_avg.push_back(mean); + // load + sum = accumulate(begin(load_ori[cnt]), end(load_ori[cnt]),0); + mean = sum / load_ori[cnt].size(); + load_avg.push_back(mean); + // p7s + sum = accumulate(begin(p7s_ori[cnt]), end(p7s_ori[cnt]),0); + mean = sum / p7s_ori[cnt].size(); + p7s_avg.push_back(mean); + // quit + sum = accumulate(begin(quit_ori[cnt]), end(quit_ori[cnt]),0); + mean = sum / quit_ori[cnt].size(); + quit_avg.push_back(mean); + + if(++cnt == 25) + break; + } + + int pos = 0; + // save + while((pos=inputLine.find("totalCtxSaveCnt=",pos) ) != -1){ + pos += 16; + int number = 0; + while(inputLine[pos]>='0' &&inputLine[pos]<='9'){ + number = number*10 + (inputLine[pos]-'0'); + pos++; + } + save_ori[cnt].push_back(number); + } + pos=0; + while((pos=inputLine.find("totalCtxLoadCnt=",pos) ) != -1){ + pos += 16; + int number = 0; + while(inputLine[pos]>='0' &&inputLine[pos]<='9'){ + number = number*10 + (inputLine[pos]-'0'); + pos++; + } + load_ori[cnt].push_back(number); + } + + pos=0; + while((pos=inputLine.find("totalProgressed7SwithchCnt=",pos) ) != -1){ + pos += 27; + int number = 0; + while(inputLine[pos]>='0' &&inputLine[pos]<='9'){ + number = number*10 + (inputLine[pos]-'0'); + pos++; + } + p7s_ori[cnt].push_back(number); + } + + pos=0; + while((pos=inputLine.find("totalUnprogressedQuitCnt=",pos) ) != -1){ + pos += 25; + int number = 0; + while(inputLine[pos]>='0' &&inputLine[pos]<='9'){ + number = number*10 + (inputLine[pos]-'0'); + pos++; + } + quit_ori[cnt].push_back(number); + } + + + } + + + for(int i = 0;i < 25;i++){ + cout << save_avg[i]<<" "; + for(auto num:save_ori[i]) + cout< Date: Thu, 29 Dec 2022 13:50:58 +0000 Subject: [PATCH 093/109] script --- ofccl_test.sh | 2 +- test_scripts/auto_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 68b8e8d..841850d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,7 +42,7 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 + export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 9a6b99f..469396a 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -30,7 +30,7 @@ collectOfcclResult = True# 收集ofccl测试结果,写入xls DATE="221229" -NCCL_ORDER="1" +NCCL_ORDER="3" host=os.environ.get("HOST") n = 5 m = 1 #nccl From 57ee21fc8cf72e854a81d8795dd2904de39ff4c8 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 30 Dec 2022 11:04:02 +0000 Subject: [PATCH 094/109] scripts --- ofccl_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 841850d..68b8e8d 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -42,7 +42,7 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi From 9ee720221ccd288eaba985e662ff1efb5241e613 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 1 Jan 2023 11:19:41 +0000 Subject: [PATCH 095/109] scripts --- test_scripts/auto_test.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git 
a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 5cc47ee..de3e4ea 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -29,7 +29,11 @@ staticOfcclExtral = True # 对ofccl的额外输出进行统计 collectOfcclResult = True# 收集ofccl测试结果,写入xls -DATE="221229" +buffer_sizes = ["64", "128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + +TINY_TEST = 0 + +DATE="230101" NCCL_ORDER="3" host=os.environ.get("HOST") n = 5 @@ -43,6 +47,13 @@ NUM_DEV = 8 ncards = [2,4,8] +if TINY_TEST == 1: + runNcclTest = False # 运行nccl测试,仅输出原始结果 + staticNccl = False # 运行统计,输出中间结果 + collectNcclResult = False # 收集nccl测试结果,写入xls + ncards = [2] + # buffer_sizes = ["64", "128", "256", "512", "1K"] + resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+str(m)+"n"+str(n)+"w"+str(w)+".xls" # static @@ -90,7 +101,7 @@ NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" if runNcclTest: os.system("echo $(date +%F%n%T)>> "+NCCL_RES_PATH) - for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: + for a in buffer_sizes: os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RES_PATH) if staticNccl: os.system("./nccl/static_nccl.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) @@ -104,7 +115,7 @@ content = f.read() bw = content.split() - axis_y = ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + axis_y = buffer_sizes for a in range(0,25): bwSheet.write(2+a+cnt*30,0,axis_y[a],style) # @@ -138,7 +149,7 @@ # avg tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3') ,style) + tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) #OFCCL @@ -164,7 +175,7 @@ OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" if runOfcclTest: os.system("echo $(date +%F%n%T)>> "+OFCCL_RES_PATH) - for a in ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"]: + for a in buffer_sizes: os.system("../build/ofccl_all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ OFCCL_RES_PATH) if staticOfccl: os.system("./ofccl/static_ofccl_bw.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH) @@ -193,7 +204,7 @@ bwSheet.write(1+cnt*30,8, 'avg-algbw',style) bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ,style) + bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) # time @@ -208,7 +219,7 @@ # avg tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) for i in 
range(0,25): - tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3') ,style) + tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) if collectNcclResult and collectOfcclResult: bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) @@ -216,7 +227,7 @@ tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)) ,style) + bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) @@ -265,7 +276,7 @@ # cntsheet cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - axis_y = ["64" ,"128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] + axis_y = buffer_sizes for a in range(0,25): cntSheet.write(2+a+cnt*30,0,axis_y[a],style) From 665de439cff3cdae5dc08f3c154d631355b6c23f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 6 Jan 2023 03:28:59 +0000 Subject: [PATCH 096/109] scripts --- ofccl_test.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 68b8e8d..b48a426 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -46,6 +46,9 @@ if [ "$BINARY" == "DEBUG" ];then if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi + if [ $MY_NUM_DEV = 2 ]; then + export CUDA_VISIBLE_DEVICES=4,5 + fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 export NBYTES=64 @@ -66,11 +69,11 @@ elif [ "$BINARY" == "PERF" ];then export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" - export MY_NUM_DEV=8 + export MY_NUM_DEV=4 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export NITER=4 + export NITER=200 export SHOW_ALL_PREPARED_COLL=1 export WARMITER=0 export NBYTES=8K From b5a42cca4cf37ec6f25d9d111c19109a3a4c9bc4 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 6 Jan 2023 03:57:24 +0000 Subject: [PATCH 097/109] scripts --- test_scripts/auto_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index de3e4ea..1f89fe8 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -33,8 +33,8 @@ TINY_TEST = 0 -DATE="230101" -NCCL_ORDER="3" +DATE="230106" +NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 m = 1 #nccl From 197018788d9228921e4f2bbc0e3a277b981477b3 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 8 Jan 2023 14:49:22 +0000 Subject: [PATCH 098/109] scripts --- ofccl_test.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index b48a426..dfefb3a 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -1,7 +1,7 @@ clear -export DEBUG_CC=0 -export DEBUG_ENQ=0 +export DEBUG_CC=1 +export DEBUG_ENQ=1 cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib @@ -95,7 +95,7 @@ if [ "$RUN_TYPE" == 
"PURE" ];then cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" - # set args -b 8M -e 8M -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 + # set args -b 64 -e 64 -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 elif [ "$RUN_TYPE" == "NSYS" ];then cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work2/ofccl/log/nsys/$NSYS_FILE $target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "NCU" ];then From ceb3a5aaf8d5de8138bff022c73d5bd62a823f02 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 10 Jan 2023 09:57:36 +0000 Subject: [PATCH 099/109] scripts --- ofccl_test.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index dfefb3a..dcd4868 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -18,6 +18,10 @@ export TOLERANT_UNPROGRESSED_CNT=10000 export BASE_CTX_SWITCH_THRESHOLD=80 export BOUNS_SWITCH_4_PROCESSED_COLL=0 export DEV_TRY_ROUND=10 +export DEBUG_FILE="/home/panlichen/work2/ofccl/log/oneflow_cpu_rank_" + +rm -rf /home/panlichen/work2/ofccl/log +mkdir -p /home/panlichen/work2/ofccl/log # export ENABLE_VQ=1 # volunteer quit # export TOLERANT_FAIL_CHECK_SQ_CNT=5000 @@ -28,6 +32,7 @@ echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo DEBUG_FILE=$DEBUG_FILE if [ ! -z $ENABLE_VQ ];then echo TOLERANT_FAIL_CHECK_SQ_CNT=$TOLERANT_FAIL_CHECK_SQ_CNT @@ -51,7 +56,7 @@ if [ "$BINARY" == "DEBUG" ];then fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64 + export NBYTES=2 export WARMITER=2 export MITER=1 export CHECK=0 From ac30fd49fa15a9131b03ea6ad7d3f2d19fcd8659 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Wed, 11 Jan 2023 09:55:49 +0000 Subject: [PATCH 100/109] datatype in cmd; MY_NUM_DEV as cmd line param --- ofccl_test.sh | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index dcd4868..9302e61 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -1,5 +1,7 @@ clear +export MY_NUM_DEV=$1 + export DEBUG_CC=1 export DEBUG_ENQ=1 @@ -47,7 +49,6 @@ fi if [ "$BINARY" == "DEBUG" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=2 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi @@ -62,7 +63,6 @@ if [ "$BINARY" == "DEBUG" ];then export CHECK=0 elif [ "$BINARY" == "PERF" ];then target="./build/ofccl_all_reduce_perf" - export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi @@ -74,7 +74,6 @@ elif [ "$BINARY" == "PERF" ];then export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" - export MY_NUM_DEV=4 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi @@ -96,8 +95,42 @@ if [ -z $RUN_TYPE ];then # RUN_TYPE="NCU" fi +# typedef enum { ncclInt8 = 0, ncclChar = 0, +# ncclUint8 = 1, +# ncclInt32 = 2, ncclInt = 2, +# ncclUint32 = 3, +# ncclInt64 = 4, +# ncclUint64 = 5, +# ncclFloat16 = 6, ncclHalf = 6, +# ncclFloat32 = 7, ncclFloat = 7, +# ncclFloat64 = 8, ncclDouble = 8, +# #if defined(__CUDA_BF16_TYPES_EXIST__) +# ncclBfloat16 = 9, +# ncclNumTypes = 10 +# #else +# ncclNumTypes = 9 +# #endif +# } ncclDataType_t; + +# 用这个: +# const char *test_typenames[ncclNumTypes] = {"int8", +# "uint8", +# "int32", +# 
"uint32", +# "int64", +# "uint64", +# "half", +# "float", +# "double" +# #if defined(__CUDA_BF16_TYPES_EXIST__) && \ +# NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +# , +# "bfloat16" +# #endif +# }; + if [ "$RUN_TYPE" == "PURE" ];then - cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="$target -d half -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" # set args -b 64 -e 64 -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 From 82c2a8e19b957cd30e4e2d08b27bf2f0ab174fee Mon Sep 17 00:00:00 2001 From: Panlichen Date: Fri, 13 Jan 2023 09:18:47 +0000 Subject: [PATCH 101/109] scripts --- ofccl_test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 9302e61..f080213 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -20,6 +20,7 @@ export TOLERANT_UNPROGRESSED_CNT=10000 export BASE_CTX_SWITCH_THRESHOLD=80 export BOUNS_SWITCH_4_PROCESSED_COLL=0 export DEV_TRY_ROUND=10 +export CHECK_REMAINING_SQE_INTERVAL=10000 export DEBUG_FILE="/home/panlichen/work2/ofccl/log/oneflow_cpu_rank_" rm -rf /home/panlichen/work2/ofccl/log @@ -34,6 +35,7 @@ echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD echo BOUNS_SWITCH_4_PROCESSED_COLL=$BOUNS_SWITCH_4_PROCESSED_COLL echo DEV_TRY_ROUND=$DEV_TRY_ROUND +echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL echo DEBUG_FILE=$DEBUG_FILE if [ ! -z $ENABLE_VQ ];then @@ -57,7 +59,7 @@ if [ "$BINARY" == "DEBUG" ];then fi export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=2 + export NBYTES=64 export WARMITER=2 export MITER=1 export CHECK=0 From c760c82589e32f259af2853f498847dfe0218b9f Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sat, 14 Jan 2023 14:21:15 +0000 Subject: [PATCH 102/109] + occl AllGather --- nccl_test.sh | 27 ++++-- ofccl_test.sh | 34 +++++--- src_simple/Makefile | 3 +- src_simple/common_simple.h | 1 + src_simple/ofccl_all_gather.cu | 151 +++++++++++++++++++++++++++++++++ 5 files changed, 194 insertions(+), 22 deletions(-) create mode 100644 src_simple/ofccl_all_gather.cu diff --git a/nccl_test.sh b/nccl_test.sh index 80a203f..de799b2 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -1,5 +1,7 @@ clear +export MY_NUM_DEV=$1 + cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple @@ -14,33 +16,40 @@ if [ -z $BINARY ];then # BINARY="PERF" fi -if [ "$BINARY" == "DEBUG" ];then +FUNC=$2 + +if [ "$FUNC" == "AR" ]; then target="./build/all_reduce_perf" - export MY_NUM_DEV=2 +elif [ "$FUNC" == "AG" ]; then + target="./build/all_gather_perf" +elif [ "$FUNC" == "RS" ]; then + target="./build/reduce_scatter_perf" +elif [ "$FUNC" == "R" ]; then + target="./build/reduce_perf" +elif [ "$FUNC" == "B" ]; then + target="./build/broadcast_perf" +fi + + +if [ "$BINARY" == "DEBUG" ];then if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64 + export NBYTES=1G export WARMITER=2 export MITER=1 export CHECK=0 elif [ "$BINARY" == "PERF" ];then - target="./build/all_reduce_perf" - export MY_NUM_DEV=8 if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export SHOW_ALL_PREPARED_COLL=0 export NITER=4 export NBYTES=8K export WARMITER=2 export MITER=4 export CHECK=0 elif [ "$BINARY" == "MS" ];then - export MY_NUM_DEV=8 - # 
target="./build/ofccl_all_reduce_ms_perf" if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi diff --git a/ofccl_test.sh b/ofccl_test.sh index f080213..514b756 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -9,11 +9,12 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -# export NCCL_MAX_NCHANNELS=1 -# export NCCL_MIN_NCHANNELS=1 -# export NCCL_NTHREADS=64 +export NCCL_MAX_NCHANNELS=1 +export NCCL_MIN_NCHANNELS=1 +export NCCL_NTHREADS=64 export CHECK=0 +export SHOW_ALL_PREPARED_COLL=0 export TRAVERSE_TIMES=10 export TOLERANT_UNPROGRESSED_CNT=10000 @@ -43,6 +44,23 @@ if [ ! -z $ENABLE_VQ ];then echo CNT_BEFORE_QUIT=$CNT_BEFORE_QUIT fi +FUNC=$2 +if [ -z $FUNC ]; then + FUNC="AR" +fi + +if [ "$FUNC" == "AR" ]; then + target="./build/ofccl_all_reduce_perf" +elif [ "$FUNC" == "AG" ]; then + target="./build/ofccl_all_gather_perf" +elif [ "$FUNC" == "RS" ]; then + target="./build/ofccl_reduce_scatter_perf" +elif [ "$FUNC" == "R" ]; then + target="./build/ofccl_reduce_perf" +elif [ "$FUNC" == "B" ]; then + target="./build/ofccl_broadcast_perf" +fi + if [ -z $BINARY ];then BINARY="DEBUG" # BINARY="MS" @@ -50,30 +68,24 @@ if [ -z $BINARY ];then fi if [ "$BINARY" == "DEBUG" ];then - target="./build/ofccl_all_reduce_perf" if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi if [ $MY_NUM_DEV = 2 ]; then export CUDA_VISIBLE_DEVICES=4,5 fi - export SHOW_ALL_PREPARED_COLL=0 export NITER=5 - export NBYTES=64 + export NBYTES=1G export WARMITER=2 export MITER=1 - export CHECK=0 elif [ "$BINARY" == "PERF" ];then - target="./build/ofccl_all_reduce_perf" if [ $MY_NUM_DEV = 4 ]; then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi - export SHOW_ALL_PREPARED_COLL=0 export NITER=8 export NBYTES=8K export WARMITER=2 export MITER=1 - export CHECK=0 elif [ "$BINARY" == "MS" ];then target="./build/ofccl_all_reduce_ms_perf" if [ $MY_NUM_DEV = 4 ]; then @@ -132,7 +144,7 @@ fi # }; if [ "$RUN_TYPE" == "PURE" ];then - cmd="$target -d half -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" + cmd="$target -b $NBYTES -e $NBYTES -f 2 -t $MY_NUM_DEV -g 1 -n $NITER -w $WARMITER -c $CHECK -M $MITER" # -d half elif [ "$RUN_TYPE" == "GDB" ];then cmd="cuda-gdb $target" # set args -b 64 -e 64 -f 2 -t 2 -g 1 -n 1 -w 0 -c 0 diff --git a/src_simple/Makefile b/src_simple/Makefile index ccad131..c007331 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -89,8 +89,7 @@ $(info CARDNAME $(NVCUFLAGS)) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -# BIN_FILES_LIST := all_reduce_group all_reduce_simple ofccl_all_reduce -BIN_FILES_LIST := ofccl_all_reduce +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 8801172..9236d77 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -74,6 +74,7 @@ typedef enum { typedef struct { int collId; int gotCqe; + // int cqeCnt; pthread_mutex_t mutex; } CallBackArgs; diff --git a/src_simple/ofccl_all_gather.cu b/src_simple/ofccl_all_gather.cu new file mode 100644 index 0000000..26fd9bb --- /dev/null +++ b/src_simple/ofccl_all_gather.cu @@ -0,0 +1,151 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s out-of-place in-place \n", 
"", "", ""); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s", size, count, typeName); +} + +void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count/nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = count/nranks; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunAllGather(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllGather for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunAllGather sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareAllGather(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllGather with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl allGatherTest = { + "AllGather", + AllGatherGetCollByteCount, + AllGatherInitData, + AllGatherGetBw, + AllGatherRunColl, + AllGatherPrepare +}; + +void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllGatherGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allGatherTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Sat, 14 Jan 2023 15:08:38 +0000 Subject: [PATCH 103/109] 5555 remove NCCL_MIN_NCHANNELS limit T^T TAT T_T T-T --- ofccl_test.sh | 6 +++--- test_scripts/auto_test.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ofccl_test.sh b/ofccl_test.sh index 514b756..d94f55b 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -9,9 +9,9 @@ cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple export NCCL_ALGO=Ring -export NCCL_MAX_NCHANNELS=1 -export NCCL_MIN_NCHANNELS=1 -export NCCL_NTHREADS=64 +# export NCCL_MAX_NCHANNELS=1 +# export NCCL_MIN_NCHANNELS=1 +# export NCCL_NTHREADS=64 export CHECK=0 export SHOW_ALL_PREPARED_COLL=0 diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 
1f89fe8..ac5f5f7 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -33,7 +33,7 @@ TINY_TEST = 0 -DATE="230106" +DATE="230114" NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 From bbb04c67cff6a4034e444cc598caa366fd7500ba Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 15 Jan 2023 03:09:53 +0000 Subject: [PATCH 104/109] + ofccl ReduceScatter --- ofccl_test.sh | 2 +- src_simple/Makefile | 2 +- src_simple/ofccl_reduce_scatter.cu | 153 +++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 src_simple/ofccl_reduce_scatter.cu diff --git a/ofccl_test.sh b/ofccl_test.sh index d94f55b..f3270ab 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -75,7 +75,7 @@ if [ "$BINARY" == "DEBUG" ];then export CUDA_VISIBLE_DEVICES=4,5 fi export NITER=5 - export NBYTES=1G + export NBYTES=64M export WARMITER=2 export MITER=1 elif [ "$BINARY" == "PERF" ];then diff --git a/src_simple/Makefile b/src_simple/Makefile index c007331..cc93c28 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -89,7 +89,7 @@ $(info CARDNAME $(NVCUFLAGS)) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/ofccl_reduce_scatter.cu b/src_simple/ofccl_reduce_scatter.cu new file mode 100644 index 0000000..44c3c10 --- /dev/null +++ b/src_simple/ofccl_reduce_scatter.cu @@ -0,0 +1,153 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); +} + +void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; + *recvcount = count/nranks; + *sendInplaceOffset = 0; + *recvInplaceOffset = count/nranks; + *paramcount = *recvcount; +} + +testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. + // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get cqe for coll_id = %d", pthread_self(), cudaDev, collId); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunReduceScatter(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduceScatter for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduceScatter sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t ReduceScatterPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareReduceScatter(count, datatype, op, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareReduceScatter with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl reduceScatterTest = { + "ReduceScatter", + ReduceScatterGetCollByteCount, + ReduceScatterInitData, + ReduceScatterGetBw, + ReduceScatterRunColl, + ReduceScatterPrepare +}; + +void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceScatterGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceScatterTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if 
((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Sun, 15 Jan 2023 04:48:00 +0000 Subject: [PATCH 105/109] + occl reduce --- src_simple/Makefile | 2 +- src_simple/common_simple.cu | 2 +- src_simple/common_simple.h | 2 +- src_simple/ofccl_all_gather.cu | 2 +- src_simple/ofccl_all_reduce.cu | 2 +- src_simple/ofccl_reduce.cu | 159 +++++++++++++++++++++++++++++ src_simple/ofccl_reduce_scatter.cu | 2 +- 7 files changed, 165 insertions(+), 6 deletions(-) create mode 100644 src_simple/ofccl_reduce.cu diff --git a/src_simple/Makefile b/src_simple/Makefile index cc93c28..1812202 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -89,7 +89,7 @@ $(info CARDNAME $(NVCUFLAGS)) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter ofccl_reduce BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/common_simple.cu b/src_simple/common_simple.cu index 52d6be6..fc1d809 100644 --- a/src_simple/common_simple.cu +++ b/src_simple/common_simple.cu @@ -666,7 +666,7 @@ testResult_t prepareColl(struct threadArgs *args, ncclDataType_t type, &op, &u64, type, ncclScalarHostImmediate, comm)); } #endif - TESTCHECK(args->collTest->prepareColl(count, type, op, comm, miter, rankCtx)); + TESTCHECK(args->collTest->prepareColl(count, type, op, root, comm, miter, rankCtx)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if (opIndex >= ncclNumOps) { diff --git a/src_simple/common_simple.h b/src_simple/common_simple.h index 9236d77..daba610 100644 --- a/src_simple/common_simple.h +++ b/src_simple/common_simple.h @@ -90,7 +90,7 @@ struct testColl { ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx); - testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); + testResult_t (*prepareColl)(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; diff --git a/src_simple/ofccl_all_gather.cu b/src_simple/ofccl_all_gather.cu index 26fd9bb..b22aab9 100644 --- a/src_simple/ofccl_all_gather.cu +++ b/src_simple/ofccl_all_gather.cu @@ -86,7 +86,7 @@ testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, int collId, CallBa return testSuccess; } -testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { +testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { NCCLCHECK(ofcclPrepareAllGather(count, datatype, op, comm, collId, rankCtx)); // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllGather with count=%lu, collId=%d", pthread_self(), count, collId); diff --git a/src_simple/ofccl_all_reduce.cu b/src_simple/ofccl_all_reduce.cu index 50aaad8..7dd65d9 100644 --- a/src_simple/ofccl_all_reduce.cu +++ b/src_simple/ofccl_all_reduce.cu @@ -96,7 +96,7 @@ testResult_t 
AllReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBa return testSuccess; } -testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { +testResult_t AllReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { NCCLCHECK(ofcclPrepareAllReduce(count, datatype, op, comm, collId, rankCtx)); // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllReduce with count=%lu, collId=%d", pthread_self(), count, collId); diff --git a/src_simple/ofccl_reduce.cu b/src_simple/ofccl_reduce.cu new file mode 100644 index 0000000..33db29c --- /dev/null +++ b/src_simple/ofccl_reduce.cu @@ -0,0 +1,159 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root); +} + +void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + *algBw = baseBw; + *busBw = baseBw; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
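+  // The callback itself only flips gotCqe under the mutex; the test's host thread
+  // is expected to poll that flag until the CQE for this collective arrives.
+  // A minimal polling sketch, assuming only the CallBackArgs fields used in this
+  // file (gotCqe, mutex) and a hypothetical helper name waitCqe:
+  //   static void waitCqe(CallBackArgs *cbArgs) {
+  //     int done = 0;
+  //     while (!done) {                          // spin until myCallback() fires
+  //       pthread_mutex_lock(&cbArgs->mutex);
+  //       done = cbArgs->gotCqe;                 // set to 1 below, under the lock
+  //       pthread_mutex_unlock(&cbArgs->mutex);
+  //     }
+  //   }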
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunReduce(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduce for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunReduce sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t ReducePrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareReduce(count, datatype, op, root, comm, collId, rankCtx)); + // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareReduce with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl reduceTest = { + "Reduce", + ReduceGetCollByteCount, + ReduceInitData, + ReduceGetBw, + ReduceRunColl, + ReducePrepare +}; + +void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i invoke ofcclPrepareReduceScatter with count=%lu, collId=%d", pthread_self(), count, collId); From 6c075fd5521492d1c337e24205a551e36e490a36 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Sun, 15 Jan 2023 11:16:14 +0000 Subject: [PATCH 106/109] +ofccl_broadcast; fix DEBUG_NT --- nccl_test.sh | 2 +- ofccl_test.sh | 6 ++ src/Makefile | 4 +- src_inplace/Makefile | 4 +- src_manual_size/Makefile | 4 +- src_nccl_manual_size/Makefile | 4 +- src_simple/Makefile | 6 +- src_simple/ofccl_all_gather.cu | 2 +- src_simple/ofccl_broadcast.cu | 146 +++++++++++++++++++++++++++++++++ 9 files changed, 165 insertions(+), 13 deletions(-) create mode 100644 
src_simple/ofccl_broadcast.cu diff --git a/nccl_test.sh b/nccl_test.sh index de799b2..1435e51 100644 --- a/nccl_test.sh +++ b/nccl_test.sh @@ -36,7 +36,7 @@ if [ "$BINARY" == "DEBUG" ];then export CUDA_VISIBLE_DEVICES=0,1,4,5 fi export NITER=5 - export NBYTES=1G + export NBYTES=64M export WARMITER=2 export MITER=1 export CHECK=0 diff --git a/ofccl_test.sh b/ofccl_test.sh index f3270ab..1e62664 100644 --- a/ofccl_test.sh +++ b/ofccl_test.sh @@ -5,6 +5,12 @@ export MY_NUM_DEV=$1 export DEBUG_CC=1 export DEBUG_ENQ=1 +unset DEBUG_CC +unset DEBUG_ENQ + +export DEBUG_NT=1 +unset DEBUG_NT + cd /home/panlichen/work2/nccl-tests export LD_LIBRARY_PATH=/home/panlichen/work2/ofccl/build/lib export NCCL_PROTO=Simple diff --git a/src/Makefile b/src/Makefile index 8cee9d8..5927cc2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else diff --git a/src_inplace/Makefile b/src_inplace/Makefile index 8b0e124..840c997 100644 --- a/src_inplace/Makefile +++ b/src_inplace/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else diff --git a/src_manual_size/Makefile b/src_manual_size/Makefile index ce42152..363ce69 100644 --- a/src_manual_size/Makefile +++ b/src_manual_size/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else diff --git a/src_nccl_manual_size/Makefile b/src_nccl_manual_size/Makefile index 4a67159..3851d9d 100644 --- a/src_nccl_manual_size/Makefile +++ b/src_nccl_manual_size/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else diff --git a/src_simple/Makefile b/src_simple/Makefile index 1812202..2206f40 100644 --- a/src_simple/Makefile +++ b/src_simple/Makefile @@ -7,7 +7,7 @@ CUDA_HOME ?= /usr/local/cuda PREFIX ?= /usr/local VERBOSE ?= 0 -DEBUG ?= 1 +DEBUG_NT ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -51,7 +51,7 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt -ifeq ($(DEBUG), 0) +ifeq ($(DEBUG_NT), 0) NVCUFLAGS += -O3 -g CXXFLAGS += -O3 -g else @@ -89,7 +89,7 @@ $(info CARDNAME 
$(NVCUFLAGS)) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter ofccl_reduce +BIN_FILES_LIST := ofccl_all_reduce ofccl_all_gather ofccl_reduce_scatter ofccl_reduce ofccl_broadcast BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src_simple/ofccl_all_gather.cu b/src_simple/ofccl_all_gather.cu index b22aab9..6cf8ddf 100644 --- a/src_simple/ofccl_all_gather.cu +++ b/src_simple/ofccl_all_gather.cu @@ -88,7 +88,7 @@ testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, int collId, CallBa testResult_t AllGatherPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { - NCCLCHECK(ofcclPrepareAllGather(count, datatype, op, comm, collId, rankCtx)); + NCCLCHECK(ofcclPrepareAllGather(count, datatype, comm, collId, rankCtx)); // OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareAllGather with count=%lu, collId=%d", pthread_self(), count, collId); return testSuccess; } diff --git a/src_simple/ofccl_broadcast.cu b/src_simple/ofccl_broadcast.cu new file mode 100644 index 0000000..4a2b217 --- /dev/null +++ b/src_simple/ofccl_broadcast.cu @@ -0,0 +1,146 @@ +#include "cuda_runtime.h" +#include "common_simple.h" +#include +#include +#include +#include + +void print_header() { + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s %6i", size, count, typeName, root); +} + +void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +int myCallback(int collIdFromCqe, void *args) { + // 不打log把这里删了,不然影响性能。 + // if (collId != collIdFromCqe) { + // // more robust error handle. 
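+  // For context: this test drives OFCCL through ofcclPrepareBroadcast() once per
+  // collective id and ofcclRunBroadcast() per measured iteration, with this
+  // callback signalling completion. A minimal sketch of that call order, with
+  // rank-context setup/teardown omitted and cbArgs a hypothetical caller-owned
+  // CallBackArgs instance:
+  //   NCCLCHECK(ofcclPrepareBroadcast(count, datatype, root, comm, collId, rankCtx));
+  //   NCCLCHECK(ofcclRunBroadcast(sendbuff, recvbuff, collId, myCallback, &cbArgs, rankCtx));
+  //   // ...then poll cbArgs.gotCqe under cbArgs.mutex until it becomes 1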
+ // OFTEST_LOG(TEST_ERROR, "<%lu> Rank<%d>, collIdFromCqe(%d) is not expected(%d)", pthread_self(), cudaDev, collIdFromCqe, collId); + // return -1; + // } + pthread_mutex_lock(&(((CallBackArgs *)args)->mutex)); + ((CallBackArgs *)args)->gotCqe = 1; + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // int collId = ((CallBackArgs *)args)->collId; + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, callback get %dth cqe for coll_id = %d", pthread_self(), cudaDev, ((CallBackArgs *)args)->cqeCnt++, collId); + + pthread_mutex_unlock(&(((CallBackArgs *)args)->mutex)); + return 0; +} + +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, int collId, CallBackArgs *args, ofcclRankCtx_t rankCtx) { + args->collId = collId; + args->gotCqe = 0; + pthread_mutex_init(&args->mutex, NULL); + NCCLCHECK(ofcclRunBroadcast(sendbuff, recvbuff, collId, myCallback, args, rankCtx)); + + // int cudaDev; + // CUDACHECK(cudaGetDevice(&cudaDev)); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunBroadcast for coll_id = %d with args @ %p", pthread_self(), cudaDev, collId, args); + // OFTEST_LOG(TEST, "<%lu> Rank<%d>, invoke ofcclRunBroadcast sendbuff @ %p, recvbuff @ %p", pthread_self(), cudaDev, sendbuff, recvbuff); + + return testSuccess; +} + +testResult_t BroadcastPrepare(size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm* comm, int collId, ofcclRankCtx_t rankCtx) { + + NCCLCHECK(ofcclPrepareBroadcast(count, datatype, root, comm, collId, rankCtx)); + OFTEST_LOG(TEST, "tid<%lu> invoke ofcclPrepareBroadcast with count=%lu, collId=%d", pthread_self(), count, collId); + return testSuccess; +} + +struct testColl broadcastTest = { + "Broadcast", + BroadcastGetCollByteCount, + BroadcastInitData, + BroadcastGetBw, + BroadcastRunColl, + BroadcastPrepare +}; + +void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + BroadcastGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &broadcastTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i Date: Thu, 19 Jan 2023 03:28:17 +0000 Subject: [PATCH 107/109] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E5=89=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 285 +++++++++++++++++++++++--------------- 1 file changed, 177 insertions(+), 108 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index ac5f5f7..566ff3e 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -7,8 +7,8 @@ font.height = 20*16 style.font = font # 设置环境变量 -os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" -# os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/zrk/work/ofccl/build/lib" +#os.environ['LD_LIBRARY_PATH'] = "/home/panlichen/work2/ofccl/build/lib" +os.environ['LD_LIBRARY_PATH'] = 
"/home/panlichen/zrk/work/ofccl/build/lib" os.environ['NCCL_PROTO'] = "Simple" os.environ['NCCL_ALGO'] = "RING" @@ -21,19 +21,19 @@ # 设置超参数 runNcclTest = True # 运行nccl测试,仅输出原始结果 staticNccl = True # 运行统计,输出中间结果 -collectNcclResult = True # 收集nccl测试结果,写入xls +collectNcclResult = False # 收集nccl测试结果,写入xls -runOfcclTest = True# 运行ofccl测试 -staticOfccl = True # 运行统计,输出中间结果 -staticOfcclExtral = True # 对ofccl的额外输出进行统计 -collectOfcclResult = True# 收集ofccl测试结果,写入xls +runOfcclTest = False# 运行ofccl测试 +staticOfccl = False # 运行统计,输出中间结果 +staticOfcclExtral = False # 对ofccl的额外输出进行统计 +collectOfcclResult = False# 收集ofccl测试结果,写入xls buffer_sizes = ["64", "128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] TINY_TEST = 0 -DATE="230114" +DATE="230118" NCCL_ORDER="1" host=os.environ.get("HOST") n = 5 @@ -67,13 +67,29 @@ table = xlwt.Workbook() -bwSheet = table.add_sheet('bw') -tmSheet = table.add_sheet('time') -cntSheet = table.add_sheet('totalCnt') +AR['bwSheet'] = table.add_sheet('allReduce_bw') +AR['tmShee'] = table.add_sheet('allReduce_time') +AR['cntSheet'] = table.add_sheet('allReduce_totalCnt') + +AG['bwSheet'] = table.add_sheet('allGather_bw') +AG['tmSheet'] = table.add_sheet('allGather_time') +AG['cntSheet'] = table.add_sheet('allGather_totalCnt') + +B['bwSheet'] = table.add_sheet('broadcast_bw') +B['tmSheet'] = table.add_sheet('broadcast_time') +B['cntSheet'] = table.add_sheet('broadcast_totalCnt') + +R['bwSheet'] = table.add_sheet('reduce_bw') +R['tmSheet'] = table.add_sheet('reduce_time') +R['cntSheet'] = table.add_sheet('reduce_totalCnt') + +RS['bwSheet'] = table.add_sheet('reduceScatter_bw') +RS['tmSheet'] = table.add_sheet('reduceScatter_time') +RS['cntSheet'] = table.add_sheet('reduceScatter_totalCnt') # 列宽 -for i in range(30): - bwSheet.col(i).width = 13 * 256 - tmSheet.col(i).width = 16 * 256 +# for i in range(30): +# AR['bwSheet'].col(i).width = 13 * 256 +# AR_tmSheet.col(i).width = 16 * 256 cnt = 0 for MY_NUM_DEV in ncards: @@ -87,69 +103,122 @@ NCCL_RES_DIR ="./nccl/test_result_"+DATE+"_"+NCCL_ORDER+"_"+str(MY_NUM_DEV)+"cards" if not os.path.exists(NCCL_RES_DIR): os.makedirs(NCCL_RES_DIR) - # 统计结果 - NCCL_OUTPUT_BW_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards.txt" - NCCL_OUTPUT_TIME_PATH=NCCL_RES_DIR+"/result_statics_nccl_"+str(MY_NUM_DEV)+"cards_time.txt" - + # 统计结果 + # allReduce + AR['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_allReduce_"+str(MY_NUM_DEV)+"cards.txt" + AR['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_allReduce_"+str(MY_NUM_DEV)+"cards_time.txt" + # allGather + AG['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_allGather_"+str(MY_NUM_DEV)+"cards.txt" + AG['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_allGather_"+str(MY_NUM_DEV)+"cards_time.txt" + # broadcast + B['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_broadcast_"+str(MY_NUM_DEV)+"cards.txt" + B['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_broadcast_"+str(MY_NUM_DEV)+"cards_time.txt" + # reduce + R['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_reduce_"+str(MY_NUM_DEV)+"cards.txt" + R['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_reduce_"+str(MY_NUM_DEV)+"cards_time.txt" + # reduceScatter + RS['nccl_bw_path']=NCCL_RES_DIR+"/result_nccl_reduceScatter_"+str(MY_NUM_DEV)+"cards.txt" + RS['nccl_time_path']=NCCL_RES_DIR+"/result_nccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_time.txt" if staticNccl == True: + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>>"+op['nccl_bw_path']) + os.system("echo $(date 
+%F%n%T)>>"+op['nccl_time_path']) + + os.system("echo $(date +%F%n%T)>>"+NCCL_AG_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_AG_TIME_PATH) + + os.system("echo $(date +%F%n%T)>>"+NCCL_B_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_B_TIME_PATH) + + os.system("echo $(date +%F%n%T)>>"+NCCL_R_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_R_TIME_PATH) + + os.system("echo $(date +%F%n%T)>>"+NCCL_RS_BW_PATH) + os.system("echo $(date +%F%n%T)>>"+NCCL_RS_TIME_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_OUTPUT_TIME_PATH) for iter in [1,2,3]: - NCCL_RES_PATH = NCCL_RES_DIR+"/nccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + # raw data + NCCL_AR = NCCL_RES_DIR+"/nccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + NCCL_AG = NCCL_RES_DIR+"/nccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + NCCL_B = NCCL_RES_DIR+"/nccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + NCCL_R = NCCL_RES_DIR+"/nccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + NCCL_RS = NCCL_RES_DIR+"/nccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + if runNcclTest: - os.system("echo $(date +%F%n%T)>> "+NCCL_RES_PATH) + os.system("echo $(date +%F%n%T)>> "+NCCL_AR) + os.system("echo $(date +%F%n%T)>> "+NCCL_AG) + os.system("echo $(date +%F%n%T)>> "+NCCL_B) + os.system("echo $(date +%F%n%T)>> "+NCCL_R) + os.system("echo $(date +%F%n%T)>> "+NCCL_RS) + for a in buffer_sizes: - os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RES_PATH) + os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_AR) + os.system("../build/all_gather_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_AG) + os.system("../build/broadcast_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_B) + os.system("../build/reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_R) + os.system("../build/reduce_scatter_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RS) + if staticNccl: - os.system("./nccl/static_nccl.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_RES_PATH+" " +NCCL_OUTPUT_TIME_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_nccl.out " +NCCL_AR+" " +NCCL_AR_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_AR+" " +NCCL_AR_TIME_PATH+" "+str(MY_NUM_DEV)) + + os.system("./nccl/static_nccl.out " +NCCL_AG+" " +NCCL_AG_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_AG+" " +NCCL_AG_TIME_PATH+" "+str(MY_NUM_DEV)) + + os.system("./nccl/static_nccl.out " +NCCL_B+" " +NCCL_B_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_B+" " +NCCL_B_TIME_PATH+" "+str(MY_NUM_DEV)) + + os.system("./nccl/static_nccl.out " +NCCL_R+" " +NCCL_R_BW_PATH+" "+str(MY_NUM_DEV)) + os.system("./nccl/static_time.out " +NCCL_R+" " +NCCL_R_TIME_PATH+" "+str(MY_NUM_DEV)) + + os.system("./nccl/static_nccl.out " +NCCL_RS+" " +NCCL_RS_BW_PATH+" "+str(MY_NUM_DEV)) + 
os.system("./nccl/static_time.out " +NCCL_RS+" " +NCCL_RS_TIME_PATH+" "+str(MY_NUM_DEV)) if collectNcclResult == True : # bus - bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + AR_bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - with open(NCCL_OUTPUT_BW_PATH) as f: + with open(NCCL_AR_BW_PATH) as f: content = f.read() bw = content.split() axis_y = buffer_sizes for a in range(0,25): - bwSheet.write(2+a+cnt*30,0,axis_y[a],style) + AR_bwSheet.write(2+a+cnt*30,0,axis_y[a],style) # for k in [0,1,2]: - bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) + AR_bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2],style) + AR_bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2],style) - bwSheet.write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) + AR_bwSheet.write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) + AR_bwSheet.write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) # avg - bwSheet.write(1+cnt*30, 4, 'avg-algbw',style) - bwSheet.write(1+cnt*30, 15, 'avg-busbw',style) + AR_bwSheet.write(1+cnt*30, 4, 'avg-algbw',style) + AR_bwSheet.write(1+cnt*30, 15, 'avg-busbw',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) - bwSheet.write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) + AR_bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) + AR_bwSheet.write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) # time - with open(NCCL_OUTPUT_TIME_PATH) as f2: + with open(NCCL_AR_TIME_PATH) as f2: content2 = f2.read() times = content2.split() - tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + AR_tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) for a in range(0,25): - tmSheet.write(2+a+cnt*30,0,axis_y[a],style) + AR_tmSheet.write(2+a+cnt*30,0,axis_y[a],style) for k in [0,1,2]: - tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k),style) + AR_tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k),style) for i in range(0,25): - tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2],style) + AR_tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2],style) # avg - tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) + AR_tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) + AR_tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) #OFCCL @@ -193,19 +262,19 @@ bw = content2.split() #bus for k in [0,1,2]: - bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) + AR_bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2],style) + AR_bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2],style) - bwSheet.write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) + AR_bwSheet.write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) for i in range(0,25): - bwSheet.write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) + AR_bwSheet.write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) # avg - bwSheet.write(1+cnt*30,8, 'avg-algbw',style) - bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) + AR_bwSheet.write(1+cnt*30,8, 'avg-algbw',style) + AR_bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) for i in 
range(0,25): - bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) - bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) + AR_bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) + AR_bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) # time with open(OFCCL_OUTPUT_TIME_PATH) as f2: @@ -213,44 +282,44 @@ times = content2.split() for k in [0,1,2]: - tmSheet.write(1+cnt*30,5+k,'ofccl-'+str(k),style) + AR_tmSheet.write(1+cnt*30,5+k,'ofccl-'+str(k),style) for i in range(0,25): - tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2],style) + AR_tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2],style) # avg - tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) + AR_tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) for i in range(0,25): - tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) + AR_tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) if collectNcclResult and collectOfcclResult: - bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) - bwSheet.write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) - tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) - tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) + AR_bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) + AR_bwSheet.write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) + AR_tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) + AR_tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) for i in range(0,25): - bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) - bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) - tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) - tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) + AR_bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) + AR_bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) + AR_tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) + AR_tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) # time 各个列的标题 if staticOfcclExtral: - tmSheet.write(1+cnt*30, 13,'nccl IO',style ) - tmSheet.write(1+cnt*30, 14,'nccl kern',style ) - tmSheet.write(1+cnt*30, 15,'ofccl-nccl kern',style ) - tmSheet.write(1+cnt*30, 16,'before after get sqe',style ) - tmSheet.write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) - tmSheet.write(1+cnt*30, 18,'before after put cqe',style ) - tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) - tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) - tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) - tmSheet.write(1+cnt*30, 27,'before after get sqe ori',style ) - tmSheet.write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) - tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) - tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe 
ori',style ) + AR_tmSheet.write(1+cnt*30, 13,'nccl IO',style ) + AR_tmSheet.write(1+cnt*30, 14,'nccl kern',style ) + AR_tmSheet.write(1+cnt*30, 15,'ofccl-nccl kern',style ) + AR_tmSheet.write(1+cnt*30, 16,'before after get sqe',style ) + AR_tmSheet.write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) + AR_tmSheet.write(1+cnt*30, 18,'before after put cqe',style ) + AR_tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) + AR_tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) + AR_tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) + AR_tmSheet.write(1+cnt*30, 27,'before after get sqe ori',style ) + AR_tmSheet.write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) + AR_tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) + AR_tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) y = 64 for i in range(0,25): - tmSheet.write(2+i+cnt*30,12,y,style) + AR_tmSheet.write(2+i+cnt*30,12,y,style) y = y*2 with open(OFCCL_OUTPUT_QE_PATH) as f3: @@ -260,64 +329,64 @@ content4 = f4.read() times4 = content4.split() for i in range(0,25): - tmSheet.write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) - tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) - tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) - tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) - tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) - tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) - tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) - tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) + AR_tmSheet.write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + AR_tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) + AR_tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + AR_tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) + AR_tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) + AR_tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) + AR_tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) + AR_tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) for j in range(0,5): - tmSheet.write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) - tmSheet.write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) - tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) - tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) + AR_tmSheet.write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) + AR_tmSheet.write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) + AR_tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) + AR_tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) - # cntsheet - cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + # AR_cntSheet + AR_cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) axis_y = buffer_sizes for a in range(0,25): - cntSheet.write(2+a+cnt*30,0,axis_y[a],style) + AR_cntSheet.write(2+a+cnt*30,0,axis_y[a],style) - cntSheet.write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) - cntSheet.write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) - cntSheet.write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) - cntSheet.write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) - cntSheet.write(1+cnt*30,6,"totalCtxSaveCnt",style) - cntSheet.write(1+cnt*30,24,"totalCtxLoadCnt",style) - cntSheet.write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) - 
cntSheet.write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) + AR_cntSheet.write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) + AR_cntSheet.write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) + AR_cntSheet.write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) + AR_cntSheet.write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) + AR_cntSheet.write(1+cnt*30,6,"totalCtxSaveCnt",style) + AR_cntSheet.write(1+cnt*30,24,"totalCtxLoadCnt",style) + AR_cntSheet.write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) + AR_cntSheet.write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) with open(OFCCL_OUTPUT_TOTALCNT_PATH) as f: line = f.readline() # save for i in range(0,25): numbers = line.split() - cntSheet.write(i+2+cnt*30,1,numbers[0]) + AR_cntSheet.write(i+2+cnt*30,1,numbers[0]) for j in range(1,len(numbers)): - cntSheet.write(i+2+cnt*30,5+j,numbers[j]) + AR_cntSheet.write(i+2+cnt*30,5+j,numbers[j]) line = f.readline() # load for i in range(0,25): numbers = line.split() - cntSheet.write(i+2+cnt*30,2,numbers[0]) + AR_cntSheet.write(i+2+cnt*30,2,numbers[0]) for j in range(1,len(numbers)): - cntSheet.write(i+2+cnt*30,23+j,numbers[j]) + AR_cntSheet.write(i+2+cnt*30,23+j,numbers[j]) line = f.readline() # totalProgressed7SwithchCnt for i in range(0,25): numbers = line.split() - cntSheet.write(i+2+cnt*30,3,numbers[0]) + AR_cntSheet.write(i+2+cnt*30,3,numbers[0]) for j in range(1,len(numbers)): - cntSheet.write(i+2+cnt*30,41+j,numbers[j]) + AR_cntSheet.write(i+2+cnt*30,41+j,numbers[j]) line = f.readline() # totalUnprogressedQuitCnt for i in range(0,25): numbers = line.split() - cntSheet.write(i+2+cnt*30,4,numbers[0]) + AR_cntSheet.write(i+2+cnt*30,4,numbers[0]) for j in range(1,len(numbers)): - cntSheet.write(i+2+cnt*30,59+j,numbers[j]) + AR_cntSheet.write(i+2+cnt*30,59+j,numbers[j]) line = f.readline() From d81c2572d9fbb4b84f6be1dd18f581fbb5bed273 Mon Sep 17 00:00:00 2001 From: novaCoder-zrk Date: Fri, 20 Jan 2023 09:44:42 +0000 Subject: [PATCH 108/109] =?UTF-8?q?=E6=B5=8B=E8=AF=95=20=E4=BA=94=E7=A7=8D?= =?UTF-8?q?=E6=93=8D=E4=BD=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 527 ++++++++++--------- test_scripts/nccl/static_nccl.cpp | 51 +- test_scripts/nccl/static_time.cpp | 44 +- test_scripts/ofccl/static_ofccl_bw_order.cpp | 46 ++ test_scripts/ofccl/static_ofccl_tm_order.cpp | 35 ++ 5 files changed, 416 insertions(+), 287 deletions(-) create mode 100644 test_scripts/ofccl/static_ofccl_bw_order.cpp create mode 100644 test_scripts/ofccl/static_ofccl_tm_order.cpp diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 566ff3e..26536d5 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -19,15 +19,15 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -runNcclTest = True # 运行nccl测试,仅输出原始结果 -staticNccl = True # 运行统计,输出中间结果 -collectNcclResult = False # 收集nccl测试结果,写入xls +runNcclTest = False # 运行nccl测试,仅输出原始结果 +staticNccl = False # 运行统计,输出中间结果 +collectNcclResult = True# 收集nccl测试结果,写入xls -runOfcclTest = False# 运行ofccl测试 -staticOfccl = False # 运行统计,输出中间结果 -staticOfcclExtral = False # 对ofccl的额外输出进行统计 -collectOfcclResult = False# 收集ofccl测试结果,写入xls +runOfcclTest = True# 运行ofccl测试 +staticOfccl = True # 运行统计,输出中间结果 +staticOfcclExtral = True# 对ofccl的额外输出进行统计 +collectOfcclResult = True# 收集ofccl测试结果,写入xls buffer_sizes = ["64", "128", "256", "512", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G"] @@ -53,7 +53,8 @@ 
collectNcclResult = False # 收集nccl测试结果,写入xls ncards = [2] # buffer_sizes = ["64", "128", "256", "512", "1K"] - +NCCL_TIER=[1,2,3] +OFCCL_ITER=[1,2,3,4,5,6] resultXlsName=host+"_"+DATE+"_"+NCCL_ORDER+"_M"+str(m)+"n"+str(n)+"w"+str(w)+".xls" # static @@ -64,28 +65,44 @@ os.system("g++ ./ofccl/static_ofccl_QE.cpp -o ./ofccl/static_ofccl_QE.out") os.system("g++ ./ofccl/static_ofccl_QE_ori.cpp -o ./ofccl/static_ofccl_QE_ori.out") os.system("g++ ./ofccl/static_ofccl_totalCnt.cpp -o ./ofccl/static_ofccl_totalCnt.out") - +os.system("g++ ./ofccl/static_ofccl_bw_order.cpp -o ./ofccl/static_ofccl_bw_order.out ") +os.system("g++ ./ofccl/static_ofccl_tm_order.cpp -o ./ofccl/static_ofccl_tm_order.out ") +AR = {} +AG = {} +B = {} +R = {} +RS = {} table = xlwt.Workbook() AR['bwSheet'] = table.add_sheet('allReduce_bw') -AR['tmShee'] = table.add_sheet('allReduce_time') +AR['tmSheet'] = table.add_sheet('allReduce_time') AR['cntSheet'] = table.add_sheet('allReduce_totalCnt') +AR['run'] = "../build/all_reduce_perf" +AR['runOfccl'] = "../build/ofccl_all_reduce_perf" AG['bwSheet'] = table.add_sheet('allGather_bw') AG['tmSheet'] = table.add_sheet('allGather_time') AG['cntSheet'] = table.add_sheet('allGather_totalCnt') +AG['run'] = "../build/all_gather_perf" +AG['runOfccl'] = "../build/ofccl_all_gather_perf" B['bwSheet'] = table.add_sheet('broadcast_bw') B['tmSheet'] = table.add_sheet('broadcast_time') B['cntSheet'] = table.add_sheet('broadcast_totalCnt') +B['run'] = "../build/broadcast_perf" +B['runOfccl']="../build/ofccl_broadcast_perf" R['bwSheet'] = table.add_sheet('reduce_bw') R['tmSheet'] = table.add_sheet('reduce_time') R['cntSheet'] = table.add_sheet('reduce_totalCnt') +R['run'] = "../build/reduce_perf" +R['runOfccl']= "../build/ofccl_reduce_perf" RS['bwSheet'] = table.add_sheet('reduceScatter_bw') RS['tmSheet'] = table.add_sheet('reduceScatter_time') RS['cntSheet'] = table.add_sheet('reduceScatter_totalCnt') +RS['run'] = "../build/reduce_scatter_perf" +RS['runOfccl'] = "../build/ofccl_reduce_scatter_perf" # 列宽 # for i in range(30): # AR['bwSheet'].col(i).width = 13 * 256 @@ -125,100 +142,73 @@ os.system("echo $(date +%F%n%T)>>"+op['nccl_bw_path']) os.system("echo $(date +%F%n%T)>>"+op['nccl_time_path']) - os.system("echo $(date +%F%n%T)>>"+NCCL_AG_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_AG_TIME_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_B_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_B_TIME_PATH) - - os.system("echo $(date +%F%n%T)>>"+NCCL_R_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_R_TIME_PATH) + for iter in NCCL_TIER: + # raw data + AR['nccl_rawData'] = NCCL_RES_DIR+"/nccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + AG['nccl_rawData'] = NCCL_RES_DIR+"/nccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + B['nccl_rawData'] = NCCL_RES_DIR+"/nccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + R['nccl_rawData'] = NCCL_RES_DIR+"/nccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + RS['nccl_rawData'] = NCCL_RES_DIR+"/nccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - os.system("echo $(date +%F%n%T)>>"+NCCL_RS_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+NCCL_RS_TIME_PATH) + if runNcclTest: + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>> "+op['nccl_rawData']) + for a in buffer_sizes: + os.system(op['run']+" -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ op['nccl_rawData']) - for iter in [1,2,3]: - 
# raw data - NCCL_AR = NCCL_RES_DIR+"/nccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - NCCL_AG = NCCL_RES_DIR+"/nccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - NCCL_B = NCCL_RES_DIR+"/nccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - NCCL_R = NCCL_RES_DIR+"/nccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" - NCCL_RS = NCCL_RES_DIR+"/nccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + if staticNccl: + for op in [AR,AG,B,R,RS]: + os.system("./nccl/static_nccl.out " +op['nccl_rawData'] +" " +op['nccl_bw_path']) + os.system("./nccl/static_time.out " +op['nccl_rawData'] +" " +op['nccl_time_path']) - if runNcclTest: - os.system("echo $(date +%F%n%T)>> "+NCCL_AR) - os.system("echo $(date +%F%n%T)>> "+NCCL_AG) - os.system("echo $(date +%F%n%T)>> "+NCCL_B) - os.system("echo $(date +%F%n%T)>> "+NCCL_R) - os.system("echo $(date +%F%n%T)>> "+NCCL_RS) - - for a in buffer_sizes: - os.system("../build/all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_AR) - os.system("../build/all_gather_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_AG) - os.system("../build/broadcast_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_B) - os.system("../build/reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_R) - os.system("../build/reduce_scatter_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -m "+str(m) +" >>"+ NCCL_RS) - - if staticNccl: - os.system("./nccl/static_nccl.out " +NCCL_AR+" " +NCCL_AR_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_AR+" " +NCCL_AR_TIME_PATH+" "+str(MY_NUM_DEV)) - - os.system("./nccl/static_nccl.out " +NCCL_AG+" " +NCCL_AG_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_AG+" " +NCCL_AG_TIME_PATH+" "+str(MY_NUM_DEV)) - - os.system("./nccl/static_nccl.out " +NCCL_B+" " +NCCL_B_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_B+" " +NCCL_B_TIME_PATH+" "+str(MY_NUM_DEV)) - - os.system("./nccl/static_nccl.out " +NCCL_R+" " +NCCL_R_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_R+" " +NCCL_R_TIME_PATH+" "+str(MY_NUM_DEV)) - - os.system("./nccl/static_nccl.out " +NCCL_RS+" " +NCCL_RS_BW_PATH+" "+str(MY_NUM_DEV)) - os.system("./nccl/static_time.out " +NCCL_RS+" " +NCCL_RS_TIME_PATH+" "+str(MY_NUM_DEV)) + - if collectNcclResult == True : - # bus - AR_bwSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - - with open(NCCL_AR_BW_PATH) as f: - content = f.read() - bw = content.split() - - axis_y = buffer_sizes - for a in range(0,25): - AR_bwSheet.write(2+a+cnt*30,0,axis_y[a],style) - # - for k in [0,1,2]: - AR_bwSheet.write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) - for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30,1+k,bw[i+k*50+2],style) - - AR_bwSheet.write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) + if collectNcclResult : + for op in [AR,AG,B,R,RS]: + # bus + op['bwSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + + with open(op['nccl_bw_path']) as f: + content = f.read() + bw = content.split() + + axis_y = buffer_sizes + for a in range(0,25): + op['bwSheet'].write(2+a+cnt*30,0,axis_y[a],style) + # + for k in [0,1,2]: + 
op['bwSheet'].write(1+cnt*30,1+k,'nccl-algbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,1+k,bw[i+k*50+2],style) + + op['bwSheet'].write(1+cnt*30,12+k,'nccl-busbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) + # avg + op['bwSheet'].write(1+cnt*30, 4, 'avg-algbw',style) + op['bwSheet'].write(1+cnt*30, 15, 'avg-busbw',style) for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30,12+k,bw[i+k*50+25+2],style) - # avg - AR_bwSheet.write(1+cnt*30, 4, 'avg-algbw',style) - AR_bwSheet.write(1+cnt*30, 15, 'avg-busbw',style) - for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) - AR_bwSheet.write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) - - # time - with open(NCCL_AR_TIME_PATH) as f2: - content2 = f2.read() - times = content2.split() - - AR_tmSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - for a in range(0,25): - AR_tmSheet.write(2+a+cnt*30,0,axis_y[a],style) - for k in [0,1,2]: - AR_tmSheet.write(1+cnt*30,1+k,'nccl-'+str(k),style) + op['bwSheet'].write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'),style ) + op['bwSheet'].write(2+i+cnt*30, 15, xlwt.Formula('SUM(M'+str(2+i+cnt*30+1)+',N'+str(2+i+cnt*30+1)+',O'+str(2+i+cnt*30+1)+')/3'),style) + + # time + with open(op['nccl_time_path']) as f2: + content2 = f2.read() + times = content2.split() + + op['tmSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + for a in range(0,25): + op['tmSheet'].write(2+a+cnt*30,0,axis_y[a],style) + for k in [0,1,2]: + op['tmSheet'].write(1+cnt*30,1+k,'nccl-'+str(k),style) + for i in range(0,25): + op['tmSheet'].write(2+i+cnt*30,1+k,times[i+k*25+2],style) + # avg + op['tmSheet'].write(1+cnt*30, 4, 'avg-nccl',style) for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30,1+k,times[i+k*25+2],style) - # avg - AR_tmSheet.write(1+cnt*30, 4, 'avg-nccl',style) - for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) + op['tmSheet'].write(2+i+cnt*30, 4, xlwt.Formula('SUM(B'+str(2+i+cnt*30+1)+',C'+str(2+i+cnt*30+1)+',D'+str(2+i+cnt*30+1)+')/3'), style) #OFCCL @@ -227,167 +217,222 @@ if not os.path.exists(OFCCL_RES_DIR): os.makedirs(OFCCL_RES_DIR) # 统计结果 - OFCCL_OUTPUT_BW_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards.txt" - OFCCL_OUTPUT_TIME_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_time.txt" - OFCCL_OUTPUT_QE_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE.txt" - OFCCL_OUTPUT_QE_ORI_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" - OFCCL_OUTPUT_TOTALCNT_PATH=OFCCL_RES_DIR+"/result_statics_ofccl_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" - - if staticOfccl == True: - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_BW_PATH) - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_TIME_PATH) + AR['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards.txt" + AR['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_order_"+str(MY_NUM_DEV)+"cards.txt" + AR['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_time.txt" + AR['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_order_"+str(MY_NUM_DEV)+"cards_time.txt" + 
AR['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_QE.txt" + AR['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + AR['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_allReduce_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + AG['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards.txt" + AG['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_order_"+str(MY_NUM_DEV)+"cards.txt" + AG['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_time.txt" + AG['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_order_"+str(MY_NUM_DEV)+"cards_time.txt" + AG['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_QE.txt" + AG['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + AG['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_allGather_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + B['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards.txt" + B['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_order_"+str(MY_NUM_DEV)+"cards.txt" + B['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_time.txt" + B['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_order_"+str(MY_NUM_DEV)+"cards_time.txt" + B['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_QE.txt" + B['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + B['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_broadcast_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + R['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards.txt" + R['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_order_"+str(MY_NUM_DEV)+"cards.txt" + R['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_time.txt" + R['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_order_"+str(MY_NUM_DEV)+"cards_time.txt" + R['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_QE.txt" + R['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + R['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_reduce_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + RS['ofccl_bw_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards.txt" + RS['ofccl_bw_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_order_"+str(MY_NUM_DEV)+"cards.txt" + RS['ofccl_tm_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_time.txt" + RS['ofccl_tm_order_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_order_"+str(MY_NUM_DEV)+"cards_time.txt" + RS['ofccl_qe_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_QE.txt" + RS['ofccl_qeOri_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_QE_ori.txt" + RS['ofccl_totalCnt_path']=OFCCL_RES_DIR+"/result_ofccl_reduceScatter_"+str(MY_NUM_DEV)+"cards_totalCnt.txt" + + if staticOfccl: + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>>"+op['ofccl_bw_path']) + os.system("echo $(date +%F%n%T)>>"+op['ofccl_tm_path']) + os.system("echo $(date +%F%n%T)>>"+op['ofccl_bw_order_path']) + os.system("echo $(date +%F%n%T)>>"+op['ofccl_tm_order_path']) if staticOfcclExtral: - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_PATH) - os.system("echo $(date +%F%n%T)>>"+OFCCL_OUTPUT_QE_ORI_PATH) + for op in 
[AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>>"+op['ofccl_qe_path']) + os.system("echo $(date +%F%n%T)>>"+op['ofccl_qeOri_path']) + - for iter in [1,2,3]: - OFCCL_RES_PATH = OFCCL_RES_DIR+"/ofccl_result_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_M"+str(M)+".txt" + for iter in OFCCL_ITER: + # raw data + AR['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_allReduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + AG['ofccl_rawData'] =OFCCL_RES_DIR+"/ofccl_allGather_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + B['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_broadcast_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + R['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_reduce_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + RS['ofccl_rawData'] = OFCCL_RES_DIR+"/ofccl_reduceScatter_"+str(iter)+"_n"+str(n)+"_w"+str(w)+"_m"+str(m)+".txt" + if runOfcclTest: - os.system("echo $(date +%F%n%T)>> "+OFCCL_RES_PATH) - for a in buffer_sizes: - os.system("../build/ofccl_all_reduce_perf -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ OFCCL_RES_PATH) + for op in [AR,AG,B,R,RS]: + os.system("echo $(date +%F%n%T)>> "+op['ofccl_rawData']) + for a in buffer_sizes: + os.system(op['runOfccl']+" -b "+str(a)+" -e "+str(a)+" -f 2 -t " +str(MY_NUM_DEV)+" -g 1 -n "+str(n)+" -w "+str(w)+" -c 0 -M "+str(M) +" >>"+ op['ofccl_rawData']) if staticOfccl: - os.system("./ofccl/static_ofccl_bw.out " +OFCCL_RES_PATH+" " +OFCCL_OUTPUT_BW_PATH) - os.system("./ofccl/static_ofccl_time.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TIME_PATH) + for op in [AR,AG,B,R,RS]: + os.system("./ofccl/static_ofccl_bw.out " +op['ofccl_rawData']+" " +op['ofccl_bw_path']) + os.system("./ofccl/static_ofccl_time.out " +op['ofccl_rawData']+" " + op['ofccl_tm_path']) if staticOfcclExtral: - os.system("./ofccl/static_ofccl_QE.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_PATH) - os.system("./ofccl/static_ofccl_QE_ori.out " +OFCCL_RES_PATH+" " + OFCCL_OUTPUT_QE_ORI_PATH) - os.system("./ofccl/static_ofccl_totalCnt.out "+OFCCL_RES_PATH+" " + OFCCL_OUTPUT_TOTALCNT_PATH) + for op in [AR,AG,B,R,RS]: + os.system("./ofccl/static_ofccl_QE.out " +op['ofccl_rawData']+" " + op['ofccl_qe_path']) + os.system("./ofccl/static_ofccl_QE_ori.out " +op['ofccl_rawData']+" " + op['ofccl_qeOri_path']) + os.system("./ofccl/static_ofccl_totalCnt.out "+op['ofccl_rawData']+" " + op['ofccl_totalCnt_path']) + if staticOfccl: + for op in [AR,AG,B,R,RS]: + os.system("./ofccl/static_ofccl_bw_order.out "+op['ofccl_bw_path']+" "+op['ofccl_bw_order_path']+" "+ str(len(OFCCL_ITER))) + os.system("./ofccl/static_ofccl_tm_order.out "+op['ofccl_tm_path']+" "+op['ofccl_tm_order_path']+" "+ str(len(OFCCL_ITER))) if collectOfcclResult == True: - - with open(OFCCL_OUTPUT_BW_PATH) as f2: - content2 = f2.read() - bw = content2.split() - #bus - for k in [0,1,2]: - AR_bwSheet.write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) + #bus width + for op in [AR,AG,B,R,RS]: + with open(op['ofccl_bw_order_path']) as f2: + content2 = f2.read() + bw = content2.split() + + for k in [0,1,2]: + op['bwSheet'].write(1+cnt*30,5+k,'ofccl-algbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,5+k,bw[i+k*50+2],style) + + op['bwSheet'].write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) + # avg + op['bwSheet'].write(1+cnt*30,8, 'avg-algbw',style) + op['bwSheet'].write(1+cnt*30, 19, 'avg-busbw',style) for i in range(0,25): - 
AR_bwSheet.write(2+i+cnt*30,5+k,bw[i+k*50+2],style) - - AR_bwSheet.write(1+cnt*30,16+k,'ofccl-busbw'+str(k),style) + op['bwSheet'].write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) + op['bwSheet'].write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) + + # time + with open(op['ofccl_tm_order_path']) as f2: + content2 = f2.read() + times = content2.split() + + for k in [0,1,2]: + op['tmSheet'].write(1+cnt*30,5+k,'ofccl-'+str(k),style) + for i in range(0,25): + op['tmSheet'].write(2+i+cnt*30,5+k,times[i+k*25+2],style) + # avg + op['tmSheet'].write(1+cnt*30, 4+4, 'avg-ofccl',style) for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30,16+k,bw[i+k*50+25+2],style) - # avg - AR_bwSheet.write(1+cnt*30,8, 'avg-algbw',style) - AR_bwSheet.write(1+cnt*30, 19, 'avg-busbw',style) - for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30, 8, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) - AR_bwSheet.write(2+i+cnt*30, 19, xlwt.Formula('SUM(Q'+str(2+i+cnt*30+1)+',R'+str(2+i+cnt*30+1)+',S'+str(2+i+cnt*30+1)+')/3'),style) - - # time - with open(OFCCL_OUTPUT_TIME_PATH) as f2: - content2 = f2.read() - times = content2.split() - - for k in [0,1,2]: - AR_tmSheet.write(1+cnt*30,5+k,'ofccl-'+str(k),style) - for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30,5+k,times[i+k*25+2],style) - # avg - AR_tmSheet.write(1+cnt*30, 4+4, 'avg-ofccl',style) - for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) + op['tmSheet'].write(2+i+cnt*30, 4+4, xlwt.Formula('SUM(F'+str(2+i+cnt*30+1)+',G'+str(2+i+cnt*30+1)+',H'+str(2+i+cnt*30+1)+')/3'), style) if collectNcclResult and collectOfcclResult: - AR_bwSheet.write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) - AR_bwSheet.write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) - AR_tmSheet.write(1+cnt*30, 9, 'ofccl-nccl',style) - AR_tmSheet.write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) - for i in range(0,25): - AR_bwSheet.write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) - AR_bwSheet.write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) - AR_tmSheet.write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) - AR_tmSheet.write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) + for op in [AR,AG,B,R,RS]: + op['bwSheet'].write(1+cnt*30, 9, '(ofccl-nccl)/nccl',style) + op['bwSheet'].write(1+cnt*30, 20, '(ofccl-nccl)/nccl',style) + op['tmSheet'].write(1+cnt*30, 9, 'ofccl-nccl',style) + op['tmSheet'].write(1+cnt*30, 10, '(ofccl-nccl)/nccl',style) + for i in range(0,25): + op['bwSheet'].write(2+i+cnt*30, 9, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1)), style) + op['bwSheet'].write(2+i+cnt*30, 20, xlwt.Formula('(T'+str(2+i+cnt*30+1)+'-P'+str(2+i+cnt*30+1)+')/P'+str(2+i+cnt*30+1) ),style) + op['tmSheet'].write(2+i+cnt*30, 9, xlwt.Formula('I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1) ),style ) + op['tmSheet'].write(2+i+cnt*30, 10, xlwt.Formula('(I'+str(2+i+cnt*30+1)+'-E'+str(2+i+cnt*30+1)+')/E'+str(2+i+cnt*30+1) ),style ) # time 各个列的标题 if staticOfcclExtral: - AR_tmSheet.write(1+cnt*30, 13,'nccl IO',style ) - AR_tmSheet.write(1+cnt*30, 14,'nccl kern',style ) - 
AR_tmSheet.write(1+cnt*30, 15,'ofccl-nccl kern',style ) - AR_tmSheet.write(1+cnt*30, 16,'before after get sqe',style ) - AR_tmSheet.write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) - AR_tmSheet.write(1+cnt*30, 18,'before after put cqe',style ) - AR_tmSheet.write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) - AR_tmSheet.write(1+cnt*30, 20,'occl rank0 time',style ) - AR_tmSheet.write(1+cnt*30, 21,'nccl kern ori',style ) - AR_tmSheet.write(1+cnt*30, 27,'before after get sqe ori',style ) - AR_tmSheet.write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) - AR_tmSheet.write(1+cnt*30, 39,'before after put cqe ori',style ) - AR_tmSheet.write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) - - y = 64 - for i in range(0,25): - AR_tmSheet.write(2+i+cnt*30,12,y,style) - y = y*2 - - with open(OFCCL_OUTPUT_QE_PATH) as f3: - content3 = f3.read() - times = content3.split() - with open(OFCCL_OUTPUT_QE_ORI_PATH) as f4: - content4 = f4.read() - times4 = content4.split() - for i in range(0,25): - AR_tmSheet.write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) - AR_tmSheet.write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) - AR_tmSheet.write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) - AR_tmSheet.write(2+cnt*30+i,16,times[2+125*cnt+i],style) - AR_tmSheet.write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) - AR_tmSheet.write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) - AR_tmSheet.write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) - AR_tmSheet.write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) - for j in range(0,5): - AR_tmSheet.write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) - AR_tmSheet.write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) - AR_tmSheet.write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) - AR_tmSheet.write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) - - # AR_cntSheet - AR_cntSheet.write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) - axis_y = buffer_sizes - for a in range(0,25): - AR_cntSheet.write(2+a+cnt*30,0,axis_y[a],style) - - AR_cntSheet.write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) - AR_cntSheet.write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) - AR_cntSheet.write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) - AR_cntSheet.write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) - AR_cntSheet.write(1+cnt*30,6,"totalCtxSaveCnt",style) - AR_cntSheet.write(1+cnt*30,24,"totalCtxLoadCnt",style) - AR_cntSheet.write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) - AR_cntSheet.write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) - - with open(OFCCL_OUTPUT_TOTALCNT_PATH) as f: - line = f.readline() - # save - for i in range(0,25): - numbers = line.split() - AR_cntSheet.write(i+2+cnt*30,1,numbers[0]) - for j in range(1,len(numbers)): - AR_cntSheet.write(i+2+cnt*30,5+j,numbers[j]) - line = f.readline() - # load - for i in range(0,25): - numbers = line.split() - AR_cntSheet.write(i+2+cnt*30,2,numbers[0]) - for j in range(1,len(numbers)): - AR_cntSheet.write(i+2+cnt*30,23+j,numbers[j]) - line = f.readline() - # totalProgressed7SwithchCnt - for i in range(0,25): - numbers = line.split() - AR_cntSheet.write(i+2+cnt*30,3,numbers[0]) - for j in range(1,len(numbers)): - AR_cntSheet.write(i+2+cnt*30,41+j,numbers[j]) - line = f.readline() - # totalUnprogressedQuitCnt - for i in range(0,25): - numbers = line.split() - AR_cntSheet.write(i+2+cnt*30,4,numbers[0]) - for j in range(1,len(numbers)): - AR_cntSheet.write(i+2+cnt*30,59+j,numbers[j]) + for op in [AR,AG,B,R,RS]: + 
op['tmSheet'].write(1+cnt*30, 13,'nccl IO',style ) + op['tmSheet'].write(1+cnt*30, 14,'nccl kern',style ) + op['tmSheet'].write(1+cnt*30, 15,'ofccl-nccl kern',style ) + op['tmSheet'].write(1+cnt*30, 16,'before after get sqe',style ) + op['tmSheet'].write(1+cnt*30, 17,'AfterSqe TO BeforeCqe',style ) + op['tmSheet'].write(1+cnt*30, 18,'before after put cqe',style ) + op['tmSheet'].write(1+cnt*30, 19,'beforeSqe TO afterCqe',style ) + op['tmSheet'].write(1+cnt*30, 20,'occl rank0 time',style ) + op['tmSheet'].write(1+cnt*30, 21,'nccl kern ori',style ) + op['tmSheet'].write(1+cnt*30, 27,'before after get sqe ori',style ) + op['tmSheet'].write(1+cnt*30, 33,'AfterSqe TO BeforeCqe ori',style ) + op['tmSheet'].write(1+cnt*30, 39,'before after put cqe ori',style ) + op['tmSheet'].write(1+cnt*30, 45,'beforeSqe TO afterCqe ori',style ) + + y = 64 + for i in range(0,25): + op['tmSheet'].write(2+i+cnt*30,12,y,style) + y = y*2 + + with open(op['ofccl_qe_path']) as f3: + content3 = f3.read() + times = content3.split() + with open(op['ofccl_qeOri_path']) as f4: + content4 = f4.read() + times4 = content4.split() + for i in range(0,25): + op['tmSheet'].write(2+cnt*30+i, 13, xlwt.Formula('E'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + op['tmSheet'].write(2+cnt*30+i, 14, xlwt.Formula('AVERAGEA(V'+str(3+i+cnt*30)+':Z'+str(3+i+cnt*30)+' )' ),style ) + op['tmSheet'].write(2+cnt*30+i, 15, xlwt.Formula('R'+str(3+i+cnt*30)+'-O'+str(3+i+cnt*30) ),style ) + op['tmSheet'].write(2+cnt*30+i,16,times[2+125*cnt+i],style) + op['tmSheet'].write(2+cnt*30+i,17,times[2+125*cnt+25+i],style) + op['tmSheet'].write(2+cnt*30+i,18,times[2+125*cnt+50+i],style) + op['tmSheet'].write(2+cnt*30+i,19,times[2+125*cnt+75+i],style) + op['tmSheet'].write(2+cnt*30+i,20,times[2+125*cnt+100+i],style) + for j in range(0,5): + op['tmSheet'].write(2+cnt*30+i,27+j,times4[2+500*cnt+i*5+j],style) + op['tmSheet'].write(2+cnt*30+i,33+j,times4[2+500*cnt+125+i*5+j],style) + op['tmSheet'].write(2+cnt*30+i,39+j,times4[2+500*cnt+250+i*5+j],style) + op['tmSheet'].write(2+cnt*30+i,45+j,times4[2+500*cnt+375+i*5+j],style) + + # cntSheet + op['cntSheet'].write(cnt*30,0,str(MY_NUM_DEV)+'卡',style) + axis_y = buffer_sizes + for a in range(0,25): + op['cntSheet'].write(2+a+cnt*30,0,axis_y[a],style) + + op['cntSheet'].write(1+cnt*30,1,"totalCtxSaveCnt_avg",style) + op['cntSheet'].write(1+cnt*30,2,"totalCtxLoadCnt_avg",style) + op['cntSheet'].write(1+cnt*30,3,"totalProgressed7SwithchCnt_avg",style) + op['cntSheet'].write(1+cnt*30,4,"totalUnprogressedQuitCnt_avg",style) + op['cntSheet'].write(1+cnt*30,6,"totalCtxSaveCnt",style) + op['cntSheet'].write(1+cnt*30,24,"totalCtxLoadCnt",style) + op['cntSheet'].write(1+cnt*30,42,"totalProgressed7SwithchCnt",style) + op['cntSheet'].write(1+cnt*30,60,"totalUnprogressedQuitCnt",style) + + with open(op['ofccl_totalCnt_path']) as f: line = f.readline() + # save + for i in range(0,25): + numbers = line.split() + op['cntSheet'].write(i+2+cnt*30,1,numbers[0]) + for j in range(1,len(numbers)): + op['cntSheet'].write(i+2+cnt*30,5+j,numbers[j]) + line = f.readline() + # load + for i in range(0,25): + numbers = line.split() + op['cntSheet'].write(i+2+cnt*30,2,numbers[0]) + for j in range(1,len(numbers)): + op['cntSheet'].write(i+2+cnt*30,23+j,numbers[j]) + line = f.readline() + # totalProgressed7SwithchCnt + for i in range(0,25): + numbers = line.split() + op['cntSheet'].write(i+2+cnt*30,3,numbers[0]) + for j in range(1,len(numbers)): + op['cntSheet'].write(i+2+cnt*30,41+j,numbers[j]) + line = f.readline() + # totalUnprogressedQuitCnt + 
for i in range(0,25): + numbers = line.split() + op['cntSheet'].write(i+2+cnt*30,4,numbers[0]) + for j in range(1,len(numbers)): + op['cntSheet'].write(i+2+cnt*30,59+j,numbers[j]) + line = f.readline() diff --git a/test_scripts/nccl/static_nccl.cpp b/test_scripts/nccl/static_nccl.cpp index f12519a..911fd0c 100644 --- a/test_scripts/nccl/static_nccl.cpp +++ b/test_scripts/nccl/static_nccl.cpp @@ -6,36 +6,37 @@ int main(int argc,char* argv[]){ freopen(argv[1],"r",stdin); freopen(argv[2],"a",stdout); - int ranks = *(argv[3]) - '0'; - string str; - stringstream ss; + string inputLine; vector a; vector b; - string line; - // time - getline(cin,line); + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; - for(int t =0;t < 25;t++){ - for(int i = 0;i < (7+ranks);i++) - getline(cin,line); - - for(int i =0;i < 6;i++) - cin >> str; - - a.push_back(str); - cin >> str; - b.push_back(str); - - - for(int i = 0;i < 4;i++) - getline(cin,line); + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + b.push_back(ss.top()); + ss.pop(); + a.push_back(ss.top()); + if(++cnt == 25) + break; } - for(int i=0;i a; vector b; - string line; - // time - getline(cin,line); + string ss="bandwidth"; + string str = "N/A"; + int cnt = 0; + while(getline(cin, inputLine)){ + if (inputLine.find(str,0) == -1) + continue; - for(int t =0;t < 25;t++){ - for(int i = 0;i < (7+ranks);i++) - getline(cin,line); - - for(int i =0;i < 5;i++) - cin >> str; - - a.push_back(str); - - for(int i = 0;i < 4;i++) - getline(cin,line); + stringstream line; + line << inputLine; + string tmp; + stack ss; + while(line >> tmp){ + ss.push(tmp); + } + ss.pop(); + ss.pop(); + ss.pop(); + a.push_back(ss.top()); + if(++cnt == 25) + break; } - for(int i=0;i,less>> a(25,priority_queue,less>()); + vector,less>> b(25,priority_queue,less>()); + + + for(int i = 0;i < num;i++){ + for(int j = 0;j < 25;j++){ + double tmp; + cin>>tmp; + a[j].push(tmp); + } + for(int j = 0;j < 25;j++){ + double tmp; + cin>>tmp; + b[j].push(tmp); + } + } + + for(int i = 0;i < num;i++){ + for(int j = 0;j < 25;j++){ + double tmp; + tmp = a[j].top();a[j].pop(); + cout<,greater>> a(25,priority_queue,greater>()); + + for(int i = 0;i < num;i++){ + for(int j = 0;j < 25;j++){ + double tmp; + cin>>tmp; + a[j].push(tmp); + } + + } + + for(int i = 0;i < num;i++){ + for(int j = 0;j < 25;j++){ + double tmp; + tmp = a[j].top();a[j].pop(); + cout< Date: Fri, 27 Jan 2023 03:16:15 +0000 Subject: [PATCH 109/109] =?UTF-8?q?=E5=8E=BB=E9=99=A4=20xlrd?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test_scripts/auto_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_scripts/auto_test.py b/test_scripts/auto_test.py index 26536d5..1f8c9f1 100644 --- a/test_scripts/auto_test.py +++ b/test_scripts/auto_test.py @@ -1,5 +1,5 @@ import os -import xlrd + import xlwt # 设置字体大小 style = xlwt.XFStyle() @@ -19,8 +19,8 @@ os.environ['DEV_TRY_ROUND'] = "10" # 设置超参数 -runNcclTest = False # 运行nccl测试,仅输出原始结果 -staticNccl = False # 运行统计,输出中间结果 +runNcclTest = True # 运行nccl测试,仅输出原始结果 +staticNccl = True # 运行统计,输出中间结果 collectNcclResult = True# 收集nccl测试结果,写入xls
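
The hunks above all index five per-collective dicts, AR, AG, B, R and RS, through keys such as op['runOfccl'], op['ofccl_bw_path'], op['bwSheet'], op['tmSheet'] and op['cntSheet']; their construction happens earlier in auto_test.py and is not shown in these patches. A minimal sketch of the assumed layout follows, where only ../build/ofccl_all_reduce_perf is taken from the removed line in the test-run hunk, and the other binary paths, the 'name' key and the sheet titles are illustrative assumptions rather than part of the patch:

import xlwt

book = xlwt.Workbook()

# One dict per collective; the run, statistics and collection loops only read these keys.
AR = {'name': 'allReduce',     'runOfccl': '../build/ofccl_all_reduce_perf'}       # path seen in the removed line
AG = {'name': 'allGather',     'runOfccl': '../build/ofccl_all_gather_perf'}       # assumed binary name
B  = {'name': 'broadcast',     'runOfccl': '../build/ofccl_broadcast_perf'}        # assumed binary name
R  = {'name': 'reduce',        'runOfccl': '../build/ofccl_reduce_perf'}           # assumed binary name
RS = {'name': 'reduceScatter', 'runOfccl': '../build/ofccl_reduce_scatter_perf'}   # assumed binary name

for op in [AR, AG, B, R, RS]:
    # xlwt worksheets that the bandwidth, time and counter cells are written into
    op['bwSheet']  = book.add_sheet(op['name'] + '_bw')
    op['tmSheet']  = book.add_sheet(op['name'] + '_time')
    op['cntSheet'] = book.add_sheet(op['name'] + '_cnt')
    # The result-file keys ('ofccl_bw_path', 'ofccl_tm_order_path', 'ofccl_rawData', ...)
    # are then filled in per device count and per iteration, as in the hunks above.

With a layout like this, the refactor in these patches reduces the former AR_*-specific code to a single "for op in [AR,AG,B,R,RS]" loop covering test execution, the static_ofccl_* post-processing binaries and the spreadsheet collection.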