diff --git a/.github/workflows/doc-build.yaml b/.github/workflows/doc-build.yaml
new file mode 100644
index 000000000..78af009e8
--- /dev/null
+++ b/.github/workflows/doc-build.yaml
@@ -0,0 +1,34 @@
+name: Docs Build
+
+on:
+ pull_request:
+ branches:
+ - '**'
+
+permissions:
+ contents: read
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.10'
+
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y doxygen graphviz
+ pip install -r docs/requirements.txt
+
+ - name: Build docs
+ run: |
+ cd docs
+ doxygen
+ make html
+ touch _build/html/.nojekyll
diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml
new file mode 100644
index 000000000..1c2645edf
--- /dev/null
+++ b/.github/workflows/gh-pages.yml
@@ -0,0 +1,58 @@
+name: GitHub Pages
+
+on:
+ push:
+ branches:
+ - main
+
+ # Allows you to run this workflow manually from the Actions tab
+ workflow_dispatch:
+
+# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
+permissions:
+ contents: read
+ pages: write
+ id-token: write
+
+# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
+# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
+concurrency:
+ group: "pages"
+ cancel-in-progress: false
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ - name: Setup python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.10'
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y doxygen graphviz
+ pip install -r docs/requirements.txt
+ - name: Build docs
+ run: |
+ cd docs
+ doxygen
+ make html
+ touch _build/html/.nojekyll
+ - name: Upload artifacts
+ uses: actions/upload-pages-artifact@v3
+ with:
+ path: docs/_build/html
+
+ deploy:
+ environment:
+ name: github-pages
+ url: ${{ steps.deployment.outputs.page_url }}
+ runs-on: ubuntu-latest
+ needs: build
+ steps:
+ - name: Deploy to GitHub Pages
+ id: deployment
+ uses: actions/deploy-pages@v4
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 000000000..11da73bb2
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,36 @@
+# Read the Docs configuration file for Sphinx projects
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
+build:
+ os: ubuntu-22.04
+ apt_packages:
+ - doxygen
+ tools:
+ python: "3.12"
+ jobs:
+ pre_build:
+ - cd docs && doxygen
+
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+ configuration: docs/conf.py
+ # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
+ # builder: "dirhtml"
+ # Fail on all warnings to avoid broken references
+ # fail_on_warning: true
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+# - pdf
+# - epub
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+ install:
+ - requirements: docs/requirements.txt
diff --git a/docs/.gitignore b/docs/.gitignore
index 00d9344fb..a69fac7ab 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,3 +1,2 @@
doxygen/
_build/
-sphinx/
diff --git a/docs/README.md b/docs/README.md
index 2bb9c1efb..80d80b16c 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -3,13 +3,13 @@
1. Install `doxygen`.
```bash
- $ sudo apt-get install doxygen
+ $ sudo apt-get install doxygen graphviz
```
2. Install the Python packages below. If you install them into the user's local directory, you need to add `~/.local/bin` to `$PATH` (to use `sphinx-build`).
```bash
- $ sudo python3 -m pip install sphinx sphinx_rtd_theme breathe
+ $ sudo python3 -m pip install -r ./requirements.txt
```
3. Create Doxygen documents.
@@ -21,7 +21,7 @@
4. Create Sphinx documents.
```bash
- $ sphinx-build -b html -Dbreathe_projects.mscclpp=$PWD/doxygen/xml $PWD $PWD/sphinx
+ $ make html
```
-5. Done. The HTML files will be on `sphinx/` directory.
+5. Done. The HTML files will be in the `_build/html/` directory.
diff --git a/docs/api/index.rst b/docs/api/index.rst
new file mode 100644
index 000000000..461a9fbdb
--- /dev/null
+++ b/docs/api/index.rst
@@ -0,0 +1,5 @@
+API Reference
+=============
+
+.. doxygennamespace:: mscclpp
+ :members:
diff --git a/docs/conf.py b/docs/conf.py
index 4a94b3aa5..4d3a91022 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -14,12 +14,13 @@
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
-extensions = ["breathe"]
+extensions = ["breathe", "myst_parser"]
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# Breathe configuration
+breathe_projects = {"mscclpp": "./doxygen/xml"}
breathe_default_project = "mscclpp"
# -- Options for HTML output -------------------------------------------------
diff --git a/docs/design/design.md b/docs/design/design.md
new file mode 100644
index 000000000..82b6e0965
--- /dev/null
+++ b/docs/design/design.md
@@ -0,0 +1,157 @@
+# MSCCL++ Design Document
+## Introduction
+MSCCL++ redefines inter-GPU communication interfaces, thereby delivering a highly efficient and customizable communication stack for distributed GPU applications. Its design is specifically tailored to accommodate diverse performance optimization scenarios often encountered in state-of-the-art AI applications. The figure below provides a high-level overview of MSCCL++ abstractions in CUDA, C, and Python.
+
+
+```{figure} ../figs/abstractions.png
+:name: MSCCL++ Abstractions
+:alt: MSCCL++ Abstractions
+:align: center
+
+MSCCL++ Abstractions Overview
+```
+
+The following highlights the key features of MSCCL++.
+* **Light-weight and multi-layer abstractions.** MSCCL++ provides communication abstractions at the lowest level, close to the hardware, and at the highest level, close to the application API. The lowest level of abstraction is ultra lightweight, which enables a user to implement the data-movement logic of a collective operation such as AllReduce inside a GPU kernel extremely efficiently, without worrying about the memory ordering of different operations. The modularity of MSCCL++ enables a user to construct the building blocks of MSCCL++ at a high level of abstraction in Python and feed them to a CUDA kernel, improving the user's productivity.
+
+* **1-sided 0-copy synchronous and asynchronous abstractions.** MSCCL++ provides fine-grained synchronous and asynchronous 0-copy 1-sided abstractions for communication primitives such as `put()`, `get()`, `signal()`, `flush()`, and `wait()`. The 1-sided abstractions allow a user to asynchronously `put()` their data on the remote GPU as soon as it is ready, without requiring the remote side to issue any receive instruction. This enables users to easily implement flexible communication logic, such as overlapping communication with computation or implementing customized collective communication algorithms, without worrying about potential deadlocks. Additionally, the 0-copy capability enables MSCCL++ to directly transfer data between users' buffers without intermediate internal buffers, which saves GPU bandwidth and memory capacity.
+
+* **Unified abstractions for different interconnection hardware.** MSCCL++ provides consistent abstractions regardless of the location of the remote GPU (either on the local node or on a remote node) and the underlying link (either NVLink/xGMI or InfiniBand). This simplifies the code for inter-GPU communication, which is often complex due to the memory ordering of GPU/CPU reads/writes and is therefore error-prone.
+
+## Concepts
+
+To implement the list of features above, some concepts are introduced.
+### Channel
+MSCCL++ provides peer-to-peer communication methods between GPUs. A peer-to-peer connection between two GPUs is called a *Channel*. Channels are constructed by MSCCL++ host-side interfaces and copied to GPUs during initialization. Channels provide *GPU-side interfaces*, which means that all communication methods are defined as device functions to be called from GPU kernel code. The following code shows the basic usage of a channel; the `put()` method copies 1KB of data from the local GPU to a remote GPU.
+```cpp
+__global__ void gpuKernel() {
+ ...
+ // Only one thread is needed for this method.
+ channel.put(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024);
+ ...
+}
+```
+MSCCL++ also provides efficient synchronization methods, `signal()`, `flush()`, and `wait()`. We will discuss these methods in the following sections.
+
+#### SmChannel & ProxyChannel
+MSCCL++ delivers two types of channels, **ProxyChannel** and **SmChannel**. `ProxyChannel` provides (R)DMA-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy (hence the name `ProxyChannel`), which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, `ProxyChannel` requires only a single GPU thread to call its methods. See all `ProxyChannel` methods [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/proxy_channel_device.hpp).
+
+On the other hand, `SmChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods directly use GPU threads to read/write from/to the remote GPU's memory space. Compared with `ProxyChannel`, `SmChannel` is especially performant for low-latency scenarios, though it may need many GPU threads calling its copy methods concurrently to achieve high copy bandwidth. See all `SmChannel` methods [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/sm_channel_device.hpp).
+
+### Fifo & Trigger
+One of the key features of MSCCL++ is offloading communication logic from the GPU to the CPU.
+To achieve this, MSCCL++ introduces the concepts of `Fifo` and `Trigger`. A `Fifo` is a circular buffer shared between the GPU and the CPU, used to store `Trigger`s. A `Trigger` is a command sent from the GPU to the CPU, notifying the CPU that there are commands in the `Fifo` that need to be processed. The CPU processes the commands in the `Fifo` and signals back to the GPU once they have been processed. The implementation details of `Fifo` and `Trigger` can be found in the following sections.
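+
+As a rough illustration, the GPU side of this handshake could look like the sketch below. This is a minimal sketch, assuming a raw two-word trigger and a `push()`/`sync()` device interface on the Fifo handle; in practice, the `ProxyChannel` methods described below encode and push triggers for you.
+
+```cpp
+// Minimal sketch: a GPU thread hands a command to the CPU proxy via the Fifo.
+// The field values are illustrative; see the trigger layout later in this doc.
+__device__ void submitTrigger(mscclpp::FifoDeviceHandle& fifo) {
+  mscclpp::ProxyTrigger trigger;
+  trigger.fst = 1024;  // illustrative: would encode the copy size and source offset
+  trigger.snd = 1;     // illustrative: would encode dst offset, memory ids, op, channel id
+  uint64_t slot = fifo.push(trigger);  // enqueue the command for the CPU
+  fifo.sync(slot);                     // optionally wait until the CPU has consumed it
+}
+```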
+
+### ProxyService
+Proxy service is a persistent service that resides on the CPU side. It functions as a polling service that receives `Trigger` messages from the GPU side and then transfers data according to the command. When we use `ProxyChannel` for communication, a `Trigger` is sent from the GPU side to the `ProxyService`. The `ProxyService` then invokes `cudaMemcpy*` or IB verbs to transfer data to the target device.
+
+## Implementation
+
+The core of MSCCL++ is implemented in C++ and CUDA. We offer both C++ and Python APIs for initializing communication channels. For interactions within the GPU kernel, we offer a collection of low-level device functions. Subsequent sections will delve into these interfaces and the methodology for transferring communication logic from the GPU to the CPU.
+
+### Interfaces
+This section delivers a comprehensive overview of the MSCCL++ interfaces, encompassing both the setup and initialization of communication channels and the MSCCL++ kernel programming model.
+
+#### Communication setup and initialization APIs
+MSCCL++ provides APIs in both C++ and Python for establishing communication channels, with further information available in the [Initialization](../getting-started/tutorials/initialization.md) section. Presently, it supports two types of transports: `cudaIPC` for `NVLink/xGMI`, and `IB` for `InfiniBand`. Users are empowered to select the connection type that best suits their hardware infrastructure.
+
+#### MSCCL++ kernel programming model
+MSCCL++ offers one-sided communication methods directly callable from a GPU kernel, encompassing two primary API categories: data copy and synchronization. The data copy API features functions such as `put()`, `get()`, `read()`, and `write()`, while the synchronization API comprises `signal()`, `flush()`, and `wait()`. Demonstrated below, the basic utilization of the data copy API involves the `put()` method, which transfers 1KB of data from a local GPU to a remote GPU. The sender then calls `signal()` to notify the remote peer that the data is ready to use; to receive the data, the remote peer calls the `wait()` method.
+This operation is executed within a kernel launched with a single block.
+```cpp
+// Running on rank 0
+__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChannel) {
+ smChannel[0].put(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x);
+ __syncthreads();
+ if (threadIdx.x == 0) {
+ smChannel[0].signal();
+ }
+}
+
+// Running on rank 1
+__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChannel) {
+ if (threadIdx.x == 0) {
+ smChannel[0].wait();
+ }
+ __syncthreads();
+ // Data is ready to use
+}
+```
+
+Similar to the LL protocol offered by NCCL, MSCCL++ introduces a `Packet` structure designed to facilitate the transfer of both data and flags within a single instruction, proving particularly beneficial for applications where latency is a critical concern. The following code shows the basic usage of the `Packet` structure. The flag should be the same on the sender and receiver sides.
+```cpp
+// Running on rank 0
+__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChans, int flag) {
+ smChans[0].putPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
+ /*flag=*/ flag);
+}
+
+// Running on rank 1
+__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChans, int flag) {
+ smChans[0].getPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
+ /*flag=*/ flag);
+ // Data is ready to use
+}
+```
+
+### The mechanism for offloading communication logic from the GPU to the CPU
+
+As mentioned in the previous section, the offloading of communication logic from the GPU to the CPU is accomplished through the `Fifo` and `Trigger` mechanism.
+
+The diagram below details the structure of a `Trigger`, employing three bits to denote the operation type: `data transfer`, `signal`, and `flush`. The remaining fields specify the precise data locations for both local and remote buffers.
+
+```
+|-------------------|-------------------|-------------------|-----------------|-----------------|---------|-------------------|---------------|
+| 32bit size | 32bit src offset | 32bit dst offset | 9bit src mem id | 9bit dst mem id | 3bit op | 10bit channel id | 1bit reserved |
+|-------------------|-------------------|-------------------|-----------------|-----------------|---------|-------------------|---------------|
+```
+
+The proxy trigger format
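+
+For illustration, the layout above can be thought of as the following C++ bit-field. This is a sketch with made-up names, not the actual mscclpp definition.
+
+```cpp
+#include <cstdint>
+
+// 128-bit trigger layout matching the diagram above (names are illustrative).
+struct ProxyTriggerLayout {
+  // first 64-bit word
+  uint64_t size : 32;       // copy size in bytes
+  uint64_t srcOffset : 32;  // offset into the registered source memory
+  // second 64-bit word
+  uint64_t dstOffset : 32;  // offset into the registered destination memory
+  uint64_t srcMemId : 9;    // id of the source memory registration
+  uint64_t dstMemId : 9;    // id of the destination memory registration
+  uint64_t op : 3;          // operation type: data transfer, signal, flush
+  uint64_t chanId : 10;     // id of the proxy channel
+  uint64_t reserved : 1;
+};
+// Packs into two 64-bit words on common compilers.
+static_assert(sizeof(ProxyTriggerLayout) == 16, "trigger is 128 bits");
+```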
+
+Page-locked memory is utilized for the `Fifo`, guaranteeing access by both the CPU and GPU. On the CPU side, a polling thread periodically checks the `Fifo` for new commands. Upon processing a command, it increments a counter to signal to the GPU that the command has been executed. Users wishing to ensure a command has been processed can invoke `flush()`, which waits until the device-side counter reflects this update.
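+
+A simplified view of that polling loop is sketched below. The two-word trigger, the empty-slot convention, and the `handleTrigger()` helper are assumptions for illustration; the actual proxy implementation handles many channels and operation types.
+
+```cpp
+#include <atomic>
+#include <cstdint>
+
+struct RawTrigger { uint64_t fst, snd; };
+void handleTrigger(const RawTrigger& t);  // e.g. issues cudaMemcpy* or ibv_post_send
+
+// Minimal sketch of the CPU-side polling thread over the page-locked Fifo.
+void proxyLoop(RawTrigger* fifo, uint64_t* deviceCounter, int fifoSize, std::atomic<bool>& running) {
+  uint64_t head = 0;
+  while (running.load(std::memory_order_relaxed)) {
+    volatile RawTrigger* slot = (volatile RawTrigger*)&fifo[head % fifoSize];
+    if (slot->snd == 0) continue;          // empty slot: the GPU has not pushed a command yet
+    handleTrigger(fifo[head % fifoSize]);  // execute the requested copy/signal/flush
+    slot->snd = 0;                         // release the slot for reuse
+    *deviceCounter = ++head;               // GPU-visible progress counter; flush() waits on this
+  }
+}
+```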
+
+## Use Cases
+
+In this section, we will discuss several use cases that demonstrate the capabilities of MSCCL++.
+
+### Overlapping communication with computation
+
+MSCCL++ enables the offloading of communication logic from the GPU to the CPU, facilitating the overlapping of communication and computation processes. The code snippet provided illustrates this overlapping technique. In the depicted scenario, the GPU emits a signal to the CPU indicating readiness for data transfer. Subsequently, while the GPU continues to execute computation tasks, the CPU initiates the data transfer to the designated target device.
+```cpp
+__device__ void gpuKernel(mscclpp::SimpleProxyChannelDeviceHandle* proxyChannel) {
+ int tid = threadIdx.x + blockIdx.x * blockDim.x;
+ // Send a trigger to the CPU
+ if (tid == 0) {
+ proxyChannel[0].putWithSignal(/*dstOffset*/ 0, /*srcOffset*/ 0, /*size*/ 1024);
+ }
+ // Continue computation
+ matrixMul();
+ // ...
+}
+```
+
+### Fusion of communication and computation
+
+Traditional communication libraries enforce a separation between communication and computation, creating a bottleneck where communication must await the completion of computation, especially when data dependencies exist. In contrast, MSCCL++ leverages its low-level primitives to facilitate the seamless integration of communication with computation. By segmenting the computation into tiles, MSCCL++ enables the simultaneous pipelining of computation and communication tasks. This approach not only mitigates the communication delay by overlapping processes but also significantly improves throughput by leveraging the low-level API for fine-grained control over the hardware, ensuring optimal efficiency.
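+
+The sketch below illustrates the idea under simplifying assumptions: a hypothetical `computeTile()` helper and a single `SmChannel` per peer. This is not actual MSCCL++ code, just a shape of the pattern.
+
+```cpp
+__device__ void computeTile(int t);  // hypothetical: produces tile t in the local buffer
+
+// Minimal sketch: compute a tile, then ship it while later tiles are computed.
+__device__ void fusedComputeAndSend(mscclpp::SmChannelDeviceHandle* chan, int nTiles, uint64_t tileBytes) {
+  for (int t = 0; t < nTiles; ++t) {
+    computeTile(t);
+    __syncthreads();  // make tile t visible to all threads in the block
+    // All threads cooperate to copy tile t to the remote GPU.
+    chan->put(/*dstOffset=*/ t * tileBytes, /*srcOffset=*/ t * tileBytes, /*size=*/ tileBytes,
+              /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x);
+  }
+  if (threadIdx.x == 0) chan->signal();  // notify the peer once all tiles are sent
+}
+```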
+
+### Implementing customized collective communication algorithms
+
+MSCCL++ offers a low-level communication API, allowing users to design customized collective communication algorithms. The following code demonstrates how to implement a customized AllToAll algorithm using MSCCL++.
+```cpp
+using DeviceHandle = mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel>;
+__device__ void localAlltoall(DeviceHandle* proxyChans, int rank,
+ int nRanksPerNode, size_t nElements) {
+ int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
+ for (int i = 1; i < nRanksPerNode; i++) {
+ DeviceHandle proxyChan = proxyChans[blockIdx.x];
+ if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) {
+ proxyChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
+ nElements * sizeof(int));
+ }
+ // wait for the data from GPU (rank - i) % nRanksPerNode to arrive
+ if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank - i + nRanksPerNode) % nRanksPerNode) {
+ proxyChan.wait();
+ }
+ deviceSyncer.sync(nRanksPerNode - 1);
+ }
+}
+```
diff --git a/apps/nccl/README.md b/docs/design/nccl-over-mscclpp.md
similarity index 55%
rename from apps/nccl/README.md
rename to docs/design/nccl-over-mscclpp.md
index 364feeed2..ca362e9b5 100644
--- a/apps/nccl/README.md
+++ b/docs/design/nccl-over-mscclpp.md
@@ -1,6 +1,7 @@
-## NCCL Over MSCCL++
+# NCCL Over MSCCL++
-### Limitations
+(limitations)=
+## Limitations
Current NCCL over MSCCL++ has a few limitations.
@@ -8,7 +9,8 @@ Current NCCL over MSCCL++ has a few limitations.
* Multi-node communication is not supported yet.
* Currently, collective communication functions may not work correctly if the buffer address differs from that of previous function calls while sharing the same base address (returned by [cuMemGetAddressRange](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g64fee5711274a2a0573a789c94d8299b)) with the previous buffer. This is because the current implementation performs zero-copy communication over user buffers, and it is difficult to efficiently inform all ranks if the buffer address dynamically changes.
-### API Support Table
+(api-support-table)=
+## API Support Table
The table below lists all NCCL APIs (v2.21). We may cover more APIs in the future.
@@ -44,3 +46,26 @@ The table below lists all NCCL APIs (v2.21). We may cover more APIs in the futur
| ncclRecv | X |
| ncclRedOpCreatePreMulSum | X |
| ncclRedOpDestroy | X |
+
+## Executor Support
+
+The executor is a versatile tool designed to specify how mscclpp executes algorithms. Currently, only the allReduce operation allows for algorithm customization. The following environment variables can be set:
+
+- ALLREDUCEPKT_IP_JSON_FILE: Specifies the path to the JSON file that defines the algorithm for small-sized, in-place operations.
+- ALLREDUCEPKT_OP_JSON_FILE: Specifies the path to the JSON file that defines the algorithm for small-sized, out-of-place operations.
+- ALLREDUCE_IP_JSON_FILE: Specifies the path to the JSON file that defines the algorithm for larger-sized, in-place operations.
+- ALLREDUCE_OP_JSON_FILE: Specifies the path to the JSON file that defines the algorithm for larger-sized, out-of-place operations.
+- ALLREDUCE_SMALL_MSG_BOUNDARY: Defines the size threshold at which the algorithm will switch between fallback code and the customized algorithm for small messages.
+- ALLREDUCE_LARGE_MSG_BOUNDARY: Defines the size threshold at which the algorithm will switch between the customized algorithm for small messages and that for larger messages.
+
+```{figure} ../figs/size_boundary_diagram.png
+:name: Size Boundary Diagram
+:alt: Size Boundary Diagram
+:align: center
+
+Decision Flowchart for Message Size-Based Algorithm Execution
+```
+
+This is an example of running the NCCL API test with the executor:
+``` bash
+mpirun -np 8 -x ALLREDUCEPKT_IP_JSON_FILE=/root/azure-mscclpp/nccl/test/execution-files/allreducepacket.json -x ALLREDUCE_IP_JSON_FILE=/root/azure-mscclpp/nccl/test/execution-files/allreducesm.json -x ALLREDUCE_SMALL_MSG_BOUNDARY=16K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./apps/nccl/test/nccl_api_test
+```
diff --git a/docs/figs/size_boundary_diagram.png b/docs/figs/size_boundary_diagram.png
new file mode 100644
index 000000000..41e3a38cb
Binary files /dev/null and b/docs/figs/size_boundary_diagram.png differ
diff --git a/docs/quickstart.md b/docs/getting-started/quickstart.md
similarity index 90%
rename from docs/quickstart.md
rename to docs/getting-started/quickstart.md
index d09ce0baf..8c0982e3e 100644
--- a/docs/quickstart.md
+++ b/docs/getting-started/quickstart.md
@@ -20,6 +20,17 @@
lsmod | grep nvidia_peermem
```
+## Build with Docker Images
+
+We provide docker images which package all prerequisites for MSCCL++. You can set up your dev environment with the following command.
+
+```bash
+$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 bash
+```
+
+See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp).
+
+(build-from-source)=
## Build from Source
CMake 3.25 or later is required.
@@ -54,6 +65,7 @@ $ make -j mscclpp mscclpp_static
$ sudo make install/fast
```
+(install-from-source-python-module)=
## Install from Source (Python Module)
Python 3.8 or later is required.
@@ -101,7 +113,7 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.
### Python Benchmark
-[Install the MSCCL++ Python package](https://github.com/microsoft/mscclpp/blob/chhwang/docs/docs/quickstart.md#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
+[Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
```bash
# Choose `requirements_*.txt` according to your CUDA/ROCm version.
@@ -163,4 +175,4 @@ mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/app
If MSCCL++ is built on AMD platforms, `libmscclpp_nccl.so` would replace the [RCCL](https://github.com/ROCm/rccl) library (i.e., `librccl.so`).
-See limitations of the current NCCL over MSCCL++ from [here](../apps/nccl/README.md#limitations).
+See limitations of the current NCCL over MSCCL++ from [here](../design/nccl-over-mscclpp.md#limitations).
diff --git a/docs/getting-started/tutorials/customized-proxy-service.md b/docs/getting-started/tutorials/customized-proxy-service.md
new file mode 100644
index 000000000..232f81066
--- /dev/null
+++ b/docs/getting-started/tutorials/customized-proxy-service.md
@@ -0,0 +1 @@
+# Customize the Proxy Service
diff --git a/docs/getting-started/tutorials/index.rst b/docs/getting-started/tutorials/index.rst
new file mode 100644
index 000000000..7ee91b194
--- /dev/null
+++ b/docs/getting-started/tutorials/index.rst
@@ -0,0 +1,16 @@
+Tutorials
+----------
+
+This tutorial section provides a step-by-step guide to help you get started with the C++/Python API.
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Tutorials
+ :hidden:
+
+ initialization
+ proxy-channel
+ sm-channel
+ packet-api
+ customized-proxy-service
+ python-api
diff --git a/docs/getting-started/tutorials/initialization.md b/docs/getting-started/tutorials/initialization.md
new file mode 100644
index 000000000..0bdd8ad45
--- /dev/null
+++ b/docs/getting-started/tutorials/initialization.md
@@ -0,0 +1,71 @@
+# Initialize Communication with the MSCCL++ API
+
+In this tutorial, you will write a simple program to initialize communication between eight GPUs using the MSCCL++ C++ API. You will also learn how to use the Python API to initialize communication.
+
+## Prerequisites
+A system with eight GPUs is required to run this tutorial.
+
+Also make sure that you have installed MSCCL++ on your system. If not, please follow the [quick start](../quickstart.md).
+
+## Initialize Communication with C++ API
+We will set up a mesh topology with eight GPUs, where each GPU is connected to its neighbors. The following code shows how to initialize communication with the MSCCL++ C++ API.
+
+```cpp
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <mscclpp/core.hpp>
+#include <mscclpp/proxy_channel.hpp>
+#include <mscclpp/proxy_channel_device.hpp>
+
+template <class T>
+using DeviceHandle = mscclpp::DeviceHandle<T>;
+__constant__ DeviceHandle<mscclpp::SimpleProxyChannel> constProxyChans[8];
+
+void setupMeshTopology(int rank, int worldSize, void* data, size_t dataSize) {
+  std::string ip_port = "10.0.0.4:50000";
+  auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, worldSize);
+  bootstrap->initialize(ip_port);
+  mscclpp::Communicator comm(bootstrap);
+  mscclpp::ProxyService proxyService;
+
+  std::vector<mscclpp::SemaphoreId> semaphoreIds;
+  std::vector<mscclpp::RegisteredMemory> localMemories;
+  std::vector<mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>> connections(worldSize);
+  std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remoteMemories;
+
+  for (int r = 0; r < worldSize; ++r) {
+    if (r == rank) continue;
+    mscclpp::Transport transport = mscclpp::Transport::CudaIpc;
+    // Connect with all other ranks
+    connections[r] = comm.connectOnSetup(r, 0, transport);
+    auto memory = comm.registerMemory(data, dataSize, transport);
+    localMemories.push_back(memory);
+    comm.sendMemoryOnSetup(memory, r, 0);
+    remoteMemories.push_back(comm.recvMemoryOnSetup(r, 0));
+  }
+
+  comm.setup();
+
+  for (int r = 0; r < worldSize; ++r) {
+    if (r == rank) continue;
+    semaphoreIds.push_back(proxyService.buildAndAddSemaphore(comm, connections[r].get()));
+  }
+
+  comm.setup();
+
+  std::vector<DeviceHandle<mscclpp::SimpleProxyChannel>> proxyChannels;
+  for (size_t i = 0; i < semaphoreIds.size(); ++i) {
+    proxyChannels.push_back(mscclpp::deviceHandle(mscclpp::SimpleProxyChannel(
+        proxyService.proxyChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
+        proxyService.addMemory(localMemories[i]))));
+  }
+
+  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>)) {
+    throw std::runtime_error("unexpected error");
+  }
+  CUDACHECK(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(),
+                               sizeof(DeviceHandle<mscclpp::SimpleProxyChannel>) * proxyChannels.size()));
+}
+```
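+
+Once the channels are copied to `constProxyChans`, a kernel can use them directly. Below is a minimal sketch, assuming one thread block per remote peer and that block indexing matches the order in which the channels were pushed:
+
+```cpp
+__global__ void putToPeers() {
+  // Each block drives one proxy channel; a single thread per block is enough.
+  if (threadIdx.x == 0) {
+    constProxyChans[blockIdx.x].putWithSignal(/*dstOffset*/ 0, /*srcOffset*/ 0, /*size*/ 1024);
+  }
+}
+```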
diff --git a/docs/getting-started/tutorials/packet-api.md b/docs/getting-started/tutorials/packet-api.md
new file mode 100644
index 000000000..8f4ea7074
--- /dev/null
+++ b/docs/getting-started/tutorials/packet-api.md
@@ -0,0 +1 @@
+# Packet API for latency-sensitive applications
diff --git a/docs/getting-started/tutorials/proxy-channel.md b/docs/getting-started/tutorials/proxy-channel.md
new file mode 100644
index 000000000..fec5c4cc0
--- /dev/null
+++ b/docs/getting-started/tutorials/proxy-channel.md
@@ -0,0 +1,3 @@
+# Offload communication to CPU with ProxyChannel
+
+TBU
diff --git a/docs/getting-started/tutorials/python-api.md b/docs/getting-started/tutorials/python-api.md
new file mode 100644
index 000000000..9e6c5627b
--- /dev/null
+++ b/docs/getting-started/tutorials/python-api.md
@@ -0,0 +1,92 @@
+# Working with Python API
+
+We provide a Python API which helps to initialize and set up channels easily.
+In this tutorial, you will write a simple program to initialize communication between eight GPUs using the MSCCL++ Python API.
+
+## Setup Channel with Python API
+
+We will set up a mesh topology with eight GPUs, where each GPU is connected to its neighbors. The following code shows how to initialize communication with the MSCCL++ Python API.
+```python
+from mpi4py import MPI
+import cupy as cp
+
+from mscclpp import (
+ ProxyService,
+ Transport,
+)
+import mscclpp.comm as mscclpp_comm
+
+def create_connection(group: mscclpp_comm.CommGroup, transport: str):
+ remote_nghrs = list(range(group.nranks))
+ remote_nghrs.remove(group.my_rank)
+ if transport == "NVLink":
+ tran = Transport.CudaIpc
+ elif transport == "IB":
+ tran = group.my_ib_device(group.my_rank % 8)
+ else:
+ assert False
+ connections = group.make_connection(remote_nghrs, tran)
+ return connections
+
+if __name__ == "__main__":
+ mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD)
+ connections = create_connection(mscclpp_group, "NVLink")
+ nelems = 1024
+ memory = cp.zeros(nelems, dtype=cp.int32)
+ proxy_service = ProxyService()
+ simple_channels = mscclpp_group.make_proxy_channels(proxy_service, memory, connections)
+ proxy_service.start_proxy()
+ mscclpp_group.barrier()
+ launch_kernel(mscclpp_group.my_rank, mscclpp_group.nranks, simple_channels, memory)
+ cp.cuda.runtime.deviceSynchronize()
+ mscclpp_group.barrier()
+```
+
+### Launch Kernel with Python API
+We provide some Python utilities to help you launch kernels via Python. Here is an example.
+```python
+import os
+
+from mscclpp.utils import KernelBuilder, pack
+
+# simple_channels maps remote rank -> SimpleProxyChannel
+def launch_kernel(my_rank: int, nranks: int, simple_channels: dict, memory: cp.ndarray):
+ file_dir = os.path.dirname(os.path.abspath(__file__))
+ kernel = KernelBuilder(file="test.cu", kernel_name="simple_proxy_channel", file_dir=file_dir).get_compiled_kernel()
+ params = b""
+ first_arg = next(iter(simple_channels.values()))
+ size_of_channels = len(first_arg.device_handle().raw)
+ device_handles = []
+ for rank in range(nranks):
+ if rank == my_rank:
+ device_handles.append(
+ bytes(size_of_channels)
+ ) # just zeros for semaphores that do not exist
+ else:
+ device_handles.append(simple_channels[rank].device_handle().raw)
+ # keep a reference to the device handles so that they don't get garbage collected
+ d_channels = cp.asarray(memoryview(b"".join(device_handles)), dtype=cp.uint8)
+ params = pack(d_channels, my_rank, nranks, memory.size)
+
+ nblocks = 1
+ nthreads = 512
+ kernel.launch_kernel(params, nblocks, nthreads, 0, None)
+```
+
+The test kernel is defined in `test.cu` as follows:
+```cuda
+#include <mscclpp/proxy_channel_device.hpp>
+
+// be careful about using channels[my_rank]: it is invalid and exists just for simplicity of indexing
+extern "C" __global__ void __launch_bounds__(1024, 1)
+ simple_proxy_channel(mscclpp::SimpleProxyChannelDeviceHandle* channels, int my_rank, int nranks,
+ int num_elements) {
+ int tid = threadIdx.x;
+ int nthreads = blockDim.x;
+ uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks;
+ uint64_t my_offset = size_per_rank * my_rank;
+ __syncthreads();
+ if (tid < nranks && tid != my_rank) {
+ channels[tid].putWithSignalAndFlush(my_offset, my_offset, size_per_rank);
+ channels[tid].wait();
+ }
+}
+```
diff --git a/docs/getting-started/tutorials/sm-channel.md b/docs/getting-started/tutorials/sm-channel.md
new file mode 100644
index 000000000..191e47b36
--- /dev/null
+++ b/docs/getting-started/tutorials/sm-channel.md
@@ -0,0 +1,3 @@
+# Using SmChannel for Intra-Node Communication
+
+TBU
diff --git a/docs/index.rst b/docs/index.rst
index ba060047c..dc5604364 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -6,11 +6,56 @@
Welcome to MSCCL++'s documentation!
===================================
+MSCCL++ is a GPU-driven communication stack for scalable AI applications, designed to provide high-performance, scalable, and customizable communication for distributed GPU applications.
+
+Getting Started
+---------------
+- Follow the :doc:`quick start <getting-started/quickstart>` for your platform of choice.
+- Take a look at the :doc:`tutorials <getting-started/tutorials/index>` to learn how to write your first mscclpp program.
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Getting Started
+ :hidden:
+
+ getting-started/quickstart
+ getting-started/tutorials/index
+
+Design
+-------
+- :doc:`Design <design/design>` doc for those who want to understand the internals of MSCCL++.
+- :doc:`NCCL over MSCCL++ <design/nccl-over-mscclpp>` doc for those who want to understand how to use NCCL over MSCCL++.
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Design
+ :hidden:
+
+ design/design
+ design/nccl-over-mscclpp
+
+Performance
+---------------
+- We evaluate the performance of MSCCL++ on A100 and H100 GPUs. Here are some :doc:`performance results <performance/performance-ndmv4>` for all-reduce operations.
+
.. toctree::
- :maxdepth: 2
- :caption: Contents:
+ :maxdepth: 1
+ :caption: Performance
+ :hidden:
+
+ performance/performance-ndmv4
+
+C++ API
+---------------
+- :doc:`mscclpp <api/index>`
+
+.. toctree::
+ :maxdepth: 1
+ :caption: C++ API
+ :hidden:
+
+ api/index
Indices and tables
==================
@@ -18,9 +63,3 @@ Indices and tables
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
-
-Docs
-====
-
-.. doxygennamespace:: mscclpp
- :members:
diff --git a/docs/performance-ndmv4.md b/docs/performance/performance-ndmv4.md
similarity index 100%
rename from docs/performance-ndmv4.md
rename to docs/performance/performance-ndmv4.md
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 000000000..82bb70d03
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,3 @@
+breathe
+sphinx_rtd_theme
+myst_parser