diff --git a/.gitignore b/.gitignore index 173af9f96..f3aea954f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ _ext build/ dist/ *.egg-info/ +demos/ # ipython/jupyter notebooks # *.ipynb diff --git a/docs/source/user_guide/concepts/images/voxel_cam_view_all.png b/docs/source/user_guide/concepts/images/voxel_cam_view_all.png new file mode 100644 index 000000000..8d28df85c Binary files /dev/null and b/docs/source/user_guide/concepts/images/voxel_cam_view_all.png differ diff --git a/docs/source/user_guide/concepts/images/voxel_cam_view_one.png b/docs/source/user_guide/concepts/images/voxel_cam_view_one.png new file mode 100644 index 000000000..42055b7a1 Binary files /dev/null and b/docs/source/user_guide/concepts/images/voxel_cam_view_one.png differ diff --git a/docs/source/user_guide/concepts/images/voxel_pushcube.png b/docs/source/user_guide/concepts/images/voxel_pushcube.png new file mode 100644 index 000000000..299d2157e Binary files /dev/null and b/docs/source/user_guide/concepts/images/voxel_pushcube.png differ diff --git a/docs/source/user_guide/concepts/images/voxel_pushcube_complete.png b/docs/source/user_guide/concepts/images/voxel_pushcube_complete.png new file mode 100644 index 000000000..457a9edc7 Binary files /dev/null and b/docs/source/user_guide/concepts/images/voxel_pushcube_complete.png differ diff --git a/docs/source/user_guide/concepts/observation.md b/docs/source/user_guide/concepts/observation.md index fd252ffee..a5b70d11f 100644 --- a/docs/source/user_guide/concepts/observation.md +++ b/docs/source/user_guide/concepts/observation.md @@ -93,6 +93,106 @@ For a quick demo to visualize pointclouds, you can run python -m mani_skill.examples.demo_vis_pcd -e "PushCube-v1" ``` +### voxel + +#### Keyword arguments +This observation mode has the same data format as the [sensor_data mode](#sensor_data), but all sensor data from cameras is removed and a new key called `voxel_grid` is added instead. + +To use this observation mode, a dictionary of observation config parameters must be passed via obs_mode_config during environment initialization (gym.make()). It contains the following keys: + +- `coord_bounds`: `[torch.float32, torch.float32, torch.float32, torch.float32, torch.float32, torch.float32]` It has the form **[x_min, y_min, z_min, x_max, y_max, z_max]**, defining the metric volume to be voxelized. +- `voxel_size`: `torch.int` The number of voxels along each side of the (cubic) voxel grid. +- `device`: `torch.device` The device on which the voxelization takes place, e.g. **torch.device("cuda" if torch.cuda.is_available() else "cpu")**. +- `segmentation`: `bool` Whether to estimate voxel segmentations using the point cloud segmentations. If true, num_channels=11 (including one channel for the voxel segmentation id), otherwise num_channels=10. + +Some tabletop tasks with simple scenes like PushCube-v1 have default voxel observation mode configs available. 
In this case, you can easily initialize the environment via: + +```python +gym.make("PushCube-v1", obs_mode="voxel") +``` + +You can override the default observation mode configs by passing the obs_mode_config keyword argument following the dictionary format above: + +```python +env = gym.make("PushCube-v1", obs_mode="voxel", obs_mode_config={"coord_bounds":..., "voxel_size":..., "device":..., "segmentation":...}) +``` + +For tasks with non-trivial scenes like TwoRobotStackCube-v1, you will need one or more cameras with larger width and height to capture the complete scene. In this case, you need to pass customized obs_mode_config keyword arguments when using the voxel observation mode instead of relying on the default voxel observation configs. + + +#### Voxel grid structure + +Then, as you step through the environment and collect observations, you will see the extra key `voxel_grid` containing the generated voxel grid: + + +- `voxel_grid`: `[torch.int, torch.int, torch.int, torch.int, torch.int]` It has the form **[N, voxel_size, voxel_size, voxel_size, num_channels]**. The voxel grid is generated by fusing the point cloud and RGB data from all cameras. `N` is the batch size, `voxel_size` is the number of voxels along each side of the grid (as specified in the voxelization configs), and `num_channels` is the number of feature channels per voxel. + +- The voxel features have at most 11 channels: +>- `voxel_grid[..., :3]`: Average xyz coordinates of the points falling into the voxel +>- `voxel_grid[..., 3:6]`: Average RGB of the points falling into the voxel +>- `voxel_grid[..., 6:9]`: Voxel xyz indices +>- `voxel_grid[..., 9]`: If `segmentation=True`, this channel is the voxel segmentation id, computed as the majority class of the points falling into the voxel. If `segmentation=False`, this channel is the occupancy value indicating whether the voxel contains any point. +>- `voxel_grid[..., 10]`: If `segmentation=True`, this channel is the occupancy value indicating whether the voxel contains any point, giving 11 channels in total. If `segmentation=False`, this channel does not exist and there are only 10 channels in total. + + +An example voxel grid is visualized below. The image shows the voxelized scene of PushCube-v1 with slightly tuned default hyperparameters. The voxel grid is reconstructed from the front camera only, following the default camera settings of the PushCube-v1 task, and hence it contains only the voxels seen from the front rather than voxels throughout the scene. + +```{image} images/voxel_pushcube.png +--- +alt: Voxelized PushCube-v1 scene at the initial state +--- +``` + +The RGBD image data used to reconstruct the voxel scene above is shown in the following figure. Here we use only the single base camera of the PushCube-v1 task. + +```{image} images/voxel_cam_view_one.png +--- +alt: Corresponding RGBD observations +--- +``` + +#### Visualization demos + +For a quick demo to visualize voxel grids, you can run + +```bash +python -m mani_skill.examples.demo_vis_voxel -e "PushCube-v1" --voxel-size 200 --zoom-factor 2 --coord-bounds -1 -1 -1 2 2 2 --segmentation --device cuda +``` + +to override the default observation mode config kwargs, or simply + +```bash +python -m mani_skill.examples.demo_vis_voxel -e "PushCube-v1" +``` + +to use the demo script's default kwarg settings, which still override the task's default obs_mode_config settings. 
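+If you prefer to inspect the voxel observations programmatically rather than through the demo script, the minimal sketch below (assuming the default PushCube-v1 voxel configs; the exact channel count depends on the `segmentation` setting) shows how to read the `voxel_grid` key and slice its feature channels:
+
+```python
+import gymnasium as gym
+import mani_skill.envs  # registers ManiSkill environments with gymnasium
+
+env = gym.make("PushCube-v1", obs_mode="voxel")
+obs, _ = env.reset(seed=0)
+voxel_grid = obs["voxel_grid"]  # [N, voxel_size, voxel_size, voxel_size, num_channels]
+
+xyz = voxel_grid[..., :3]        # mean xyz of the points falling into each voxel
+rgb = voxel_grid[..., 3:6]       # mean RGB of the points falling into each voxel
+indices = voxel_grid[..., 6:9]   # voxel xyz indices
+occupancy = voxel_grid[..., -1]  # the last channel is always the occupancy value
+print(voxel_grid.shape, int(occupancy.sum().item()), "occupied voxels")
+env.close()
+```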
+ +If you don't want any override to occur and only want to use the default task obs_mode_config settings (defined in the **_default_voxel_config** property of each task), you can run + +```bash +python -m mani_skill.examples.demo_vis_voxel -e "PushCube-v1" --use-default True +``` + +Note that default configs are only defined for some tabletop tasks like PickCube-v1 and PushCube-v1. You still need to tune these configs for your own tasks and pass in your customized configs when needed. + +Furthermore, if you use more sensors (currently only RGB and depth cameras) to capture the scene and collect point cloud and RGB data from different poses, you can get a more accurate voxel grid reconstruction of the scene. The figure below shows a more completely reconstructed voxel scene of PushCube-v1 using additional RGBD cameras. + +```{image} images/voxel_pushcube_complete.png +--- +alt: Densely voxelized PushCube-v1 scene at the initial state +--- +``` + +It is reconstructed using 5 cameras placed above, to the left, to the right, in front of, and behind the tabletop scene, respectively, as shown in the visualized RGBD observations below. + +```{image} images/voxel_cam_view_all.png +--- +alt: Corresponding RGBD observations +--- +``` + + + ## Segmentation Data Objects upon being loaded are automatically assigned a segmentation ID (the `per_scene_id` attribute of `sapien.Entity` objects). To get information about which IDs refer to which Actors / Links, you can run the code below @@ -114,4 +214,4 @@ Note that ID 0 refers to the distant background. For a quick demo of this, you c ```bash python -m mani_skill.examples.demo_vis_segmentation -e "PushCube-v1" # plot all segmentations python -m mani_skill.examples.demo_vis_segmentation -e "PushCube-v1" --id cube # mask everything but the object with name "cube" -``` +``` \ No newline at end of file diff --git a/docs/source/user_guide/demos/images/voxel_pushcube.png b/docs/source/user_guide/demos/images/voxel_pushcube.png new file mode 100644 index 000000000..299d2157e Binary files /dev/null and b/docs/source/user_guide/demos/images/voxel_pushcube.png differ diff --git a/docs/source/user_guide/demos/index.md b/docs/source/user_guide/demos/index.md index 54e85204a..f030d7a8b 100644 --- a/docs/source/user_guide/demos/index.md +++ b/docs/source/user_guide/demos/index.md @@ -194,6 +194,19 @@ python -m mani_skill.examples.demo_vis_rgbd -e "StackCube-v1" ```{figure} images/rgbd_vis.png ``` +## Visualize Voxel Data + +You can run the following to visualize voxelized observation data. Under the default sensor settings, with only one camera at the front of the scene, it produces the voxelized scene shown below. + +```bash +python -m mani_skill.examples.demo_vis_voxel -e "PushCube-v1" +``` + + +```{figure} images/voxel_pushcube.png +``` + + ## Visualize Reset Distributions Determining how difficult a task might be for ML algorithms like reinforcement learning and imitation learning can heavily depend on the reset distribution of the task. 
To see what the reset distribution of any task (the result of repeated env.reset calls) looks like you can run the following to save a video to the `videos` folder diff --git a/mani_skill/envs/sapien_env.py b/mani_skill/envs/sapien_env.py index 9a26669ba..bf5fec5f4 100644 --- a/mani_skill/envs/sapien_env.py +++ b/mani_skill/envs/sapien_env.py @@ -23,6 +23,7 @@ from mani_skill.envs.utils.observations import ( parse_visual_obs_mode_to_struct, sensor_data_to_pointcloud, + sensor_data_to_voxel, ) from mani_skill.sensors.base_sensor import BaseSensor, BaseSensorConfig from mani_skill.sensors.camera import ( @@ -78,6 +79,10 @@ class BaseEnv(gym.Env): human_render_camera_configs (dict): configurations of human rendering cameras to override any environment defaults. Similar usage as @sensor_configs. + obs_mode_config (dict): configuration hyperparameters of observations. See notes for more details. + + human_render_camera_cfgs (dict): configurations of human rendering cameras. Similar usage as @sensor_cfgs. + viewer_camera_configs (dict): configurations of the viewer camera in the GUI to override any environment defaults. Similar usage as @sensor_configs. robot_uids (Union[str, BaseAgent, List[Union[str, BaseAgent]]]): List of robots to instantiate and control in the environment. @@ -111,7 +116,8 @@ class BaseEnv(gym.Env): SUPPORTED_ROBOTS: List[Union[str, Tuple[str]]] = None """Override this to enforce which robots or tuples of robots together are supported in the task. During env creation, setting robot_uids auto loads all desired robots into the scene, but not all tasks are designed to support some robot setups""" - SUPPORTED_OBS_MODES = ("state", "state_dict", "none", "sensor_data", "rgb", "depth", "segmentation", "rgbd", "rgb+depth", "rgb+depth+segmentation", "rgb+segmentation", "depth+segmentation", "pointcloud") + + SUPPORTED_OBS_MODES = ("state", "state_dict", "none", "sensor_data", "rgb", "depth", "segmentation", "rgbd", "rgb+depth", "rgb+depth+segmentation", "rgb+segmentation", "depth+segmentation", "pointcloud", "voxel") SUPPORTED_REWARD_MODES = ("normalized_dense", "dense", "sparse", "none") SUPPORTED_RENDER_MODES = ("human", "rgb_array", "sensors", "all") """The supported render modes. Human opens up a GUI viewer. rgb_array returns an rgb array showing the current environment state. @@ -135,6 +141,8 @@ class BaseEnv(gym.Env): """all sensor configurations parsed from self._sensor_configs and agent._sensor_configs""" _agent_sensor_configs: Dict[str, BaseSensorConfig] """all agent sensor configs parsed from agent._sensor_configs""" + _obs_mode_config: Dict + """configurations for converting sensor data to observations under the current observation mode (e.g. voxel size and scene bounds for voxel observations)""" _human_render_cameras: Dict[str, Camera] """cameras used for rendering the current environment retrievable via `env.render_rgb_array()`. 
These are not used to generate observations""" _default_human_render_camera_configs: Dict[str, CameraConfig] @@ -173,6 +181,7 @@ def __init__( enable_shadow: bool = False, sensor_configs: Optional[dict] = dict(), human_render_camera_configs: Optional[dict] = dict(), + obs_mode_config: dict = None, viewer_camera_configs: Optional[dict] = dict(), robot_uids: Union[str, BaseAgent, List[Union[str, BaseAgent]]] = None, sim_config: Union[SimConfig, dict] = dict(), @@ -195,6 +204,12 @@ def __init__( self._custom_viewer_camera_configs = viewer_camera_configs self._parallel_in_single_scene = parallel_in_single_scene self.robot_uids = robot_uids + + if obs_mode_config is None: + self._obs_mode_config = self._default_voxel_config + else: + self._obs_mode_config = obs_mode_config + if self.SUPPORTED_ROBOTS is not None: if robot_uids not in self.SUPPORTED_ROBOTS: logger.warn(f"{robot_uids} is not in the task's list of supported robots. Code may not run as intended") @@ -486,6 +501,15 @@ def get_obs(self, info: Optional[Dict] = None): obs = self._get_obs_with_sensor_data(info, apply_texture_transforms=False) elif self._obs_mode in ["rgb", "depth", "segmentation", "rgbd", "rgb+depth", "rgb+depth+segmentation", "depth+segmentation", "rgb+segmentation"]: obs = self._get_obs_with_sensor_data(info) + elif self._obs_mode == "voxel": + # assert on _obs_mode_config here, and pass them to the convertion function + assert self._obs_mode_config != None, "You mush pass in configs in voxel observation mode via obs_mode_config keyword arg in gym.make(). See the Maniskill docs for details. No such config detected." + assert "voxel_size" in self._obs_mode_config.keys(), "Lacking voxel_size (voxel size) in observation configs" + assert "coord_bounds" in self._obs_mode_config.keys(), "Lacking coord_bounds (coordinate bounds) in observation configs" + assert "device" in self._obs_mode_config.keys(), "Lacking device (device for voxelizations) in observation configs" + assert "segmentation" in self._obs_mode_config.keys(), "Lacking segmentation (a boolean indicating whether including voxel segmentations) in observation configs" + obs = self._get_obs_with_sensor_data(info) + obs = sensor_data_to_voxel(obs, self._sensors, self._obs_mode_config) else: raise NotImplementedError(self._obs_mode) return obs diff --git a/mani_skill/envs/tasks/control/cartpole.py b/mani_skill/envs/tasks/control/cartpole.py index a1416ebbe..4c2dfd5f5 100644 --- a/mani_skill/envs/tasks/control/cartpole.py +++ b/mani_skill/envs/tasks/control/cartpole.py @@ -105,7 +105,7 @@ def _default_sensor_configs(self): def _default_human_render_camera_configs(self): pose = sapien_utils.look_at(eye=[0, -4, 1], target=[0, 0, 1]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) - + def _load_scene(self, options: dict): loader = self.scene.create_mjcf_loader() articulation_builders, actor_builders, sensor_configs = loader.parse(MJCF_FILE) diff --git a/mani_skill/envs/tasks/fmb/fmb.py b/mani_skill/envs/tasks/fmb/fmb.py index 7009d9c86..86f6b6c4a 100644 --- a/mani_skill/envs/tasks/fmb/fmb.py +++ b/mani_skill/envs/tasks/fmb/fmb.py @@ -62,7 +62,7 @@ def _default_sensor_configs(self): def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([1.0, 0.8, 0.8], [0.0, 0.0, 0.35]) return CameraConfig("render_camera", pose, 1024, 1024, 1, 0.01, 100) - + def _load_scene(self, options: dict): self.table_scene = TableSceneBuilder( env=self, robot_init_qpos_noise=self.robot_init_qpos_noise diff --git 
a/mani_skill/envs/tasks/humanoid/humanoid_stand.py b/mani_skill/envs/tasks/humanoid/humanoid_stand.py index 4d6717b1a..e008050c0 100644 --- a/mani_skill/envs/tasks/humanoid/humanoid_stand.py +++ b/mani_skill/envs/tasks/humanoid/humanoid_stand.py @@ -34,7 +34,7 @@ def _default_sensor_configs(self): def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([1.0, 1.0, 2.5], [0.0, 0.0, 0.75]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) - + def _load_scene(self, options: dict): build_ground(self.scene) diff --git a/mani_skill/envs/tasks/mobile_manipulation/open_cabinet_drawer.py b/mani_skill/envs/tasks/mobile_manipulation/open_cabinet_drawer.py index c22dc1c35..a1f901af6 100644 --- a/mani_skill/envs/tasks/mobile_manipulation/open_cabinet_drawer.py +++ b/mani_skill/envs/tasks/mobile_manipulation/open_cabinet_drawer.py @@ -87,7 +87,7 @@ def _default_human_render_camera_configs(self): return CameraConfig( "render_camera", pose=pose, width=512, height=512, fov=1, near=0.01, far=100 ) - + def _load_scene(self, options: dict): self.ground = build_ground(self.scene) # temporarily turn off the logging as there will be big red warnings diff --git a/mani_skill/envs/tasks/tabletop/assembling_kits.py b/mani_skill/envs/tasks/tabletop/assembling_kits.py index f0719b44d..5d2c5ffbb 100644 --- a/mani_skill/envs/tasks/tabletop/assembling_kits.py +++ b/mani_skill/envs/tasks/tabletop/assembling_kits.py @@ -80,6 +80,13 @@ def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([0.3, 0.3, 0.8], [0.0, 0.0, 0.1]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): with torch.device(self.device): self.table_scene = TableSceneBuilder(self) diff --git a/mani_skill/envs/tasks/tabletop/lift_peg_upright.py b/mani_skill/envs/tasks/tabletop/lift_peg_upright.py index 15dee9a8c..fc1b3c210 100644 --- a/mani_skill/envs/tasks/tabletop/lift_peg_upright.py +++ b/mani_skill/envs/tasks/tabletop/lift_peg_upright.py @@ -39,6 +39,13 @@ def _default_human_render_camera_configs(self): pose = look_at([0.6, 0.7, 0.6], [0.0, 0.0, 0.35]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): self.table_scene = TableSceneBuilder( env=self, robot_init_qpos_noise=self.robot_init_qpos_noise diff --git a/mani_skill/envs/tasks/tabletop/peg_insertion_side.py b/mani_skill/envs/tasks/tabletop/peg_insertion_side.py index ba7d77655..30065685f 100644 --- a/mani_skill/envs/tasks/tabletop/peg_insertion_side.py +++ b/mani_skill/envs/tasks/tabletop/peg_insertion_side.py @@ -89,6 +89,13 @@ def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([0.5, -0.5, 0.8], [0.05, -0.1, 0.4]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): with torch.device(self.device): self.table_scene = 
TableSceneBuilder(self) diff --git a/mani_skill/envs/tasks/tabletop/pick_clutter_ycb.py b/mani_skill/envs/tasks/tabletop/pick_clutter_ycb.py index 89a42880d..5dcaddf00 100644 --- a/mani_skill/envs/tasks/tabletop/pick_clutter_ycb.py +++ b/mani_skill/envs/tasks/tabletop/pick_clutter_ycb.py @@ -73,6 +73,13 @@ def _default_sim_config(self): ) ) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + @property def _default_sensor_configs(self): pose = sapien_utils.look_at(eye=[0.3, 0, 0.6], target=[-0.1, 0, 0.1]) diff --git a/mani_skill/envs/tasks/tabletop/pick_cube.py b/mani_skill/envs/tasks/tabletop/pick_cube.py index 70a1f91b4..228d97577 100644 --- a/mani_skill/envs/tasks/tabletop/pick_cube.py +++ b/mani_skill/envs/tasks/tabletop/pick_cube.py @@ -36,6 +36,13 @@ def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([0.6, 0.7, 0.6], [0.0, 0.0, 0.35]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): self.table_scene = TableSceneBuilder( self, robot_init_qpos_noise=self.robot_init_qpos_noise diff --git a/mani_skill/envs/tasks/tabletop/pick_single_ycb.py b/mani_skill/envs/tasks/tabletop/pick_single_ycb.py index 17d907c65..9d929c9c1 100644 --- a/mani_skill/envs/tasks/tabletop/pick_single_ycb.py +++ b/mani_skill/envs/tasks/tabletop/pick_single_ycb.py @@ -69,7 +69,7 @@ def _default_sensor_configs(self): def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([0.6, 0.7, 0.6], [0.0, 0.0, 0.35]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) - + def _load_scene(self, options: dict): global WARNED_ONCE self.table_scene = TableSceneBuilder( diff --git a/mani_skill/envs/tasks/tabletop/place_sphere.py b/mani_skill/envs/tasks/tabletop/place_sphere.py index 1f18461ef..c00490ed2 100644 --- a/mani_skill/envs/tasks/tabletop/place_sphere.py +++ b/mani_skill/envs/tasks/tabletop/place_sphere.py @@ -70,18 +70,9 @@ def _default_sim_config(self): @property def _default_sensor_configs(self): - pose = sapien_utils.look_at(eye=[0.3, 0, 0.2], target=[-0.1, 0, 0]) - return [ - CameraConfig( - "base_camera", - pose=pose, - width=128, - height=128, - fov=np.pi / 2, - near=0.01, - far=100, - ) - ] + pose = sapien_utils.look_at(eye=[0.3, 0, 0.6], target=[-0.1, 0, 0.1]) + return [CameraConfig("base_camera", pose, 128, 128, np.pi / 2, 0.01, 100)] + @property def _default_human_render_camera_configs(self): @@ -89,7 +80,14 @@ def _default_human_render_camera_configs(self): return CameraConfig( "render_camera", pose=pose, width=512, height=512, fov=1, near=0.01, far=100 ) - + + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _build_bin(self, radius): builder = self.scene.create_actor_builder() diff --git a/mani_skill/envs/tasks/tabletop/plug_charger.py b/mani_skill/envs/tasks/tabletop/plug_charger.py index a82bc8326..59113c61d 100644 --- a/mani_skill/envs/tasks/tabletop/plug_charger.py +++ b/mani_skill/envs/tasks/tabletop/plug_charger.py @@ -44,7 +44,14 @@ def 
_default_sensor_configs(self): return [ CameraConfig("base_camera", pose=pose, width=128, height=128, fov=np.pi / 2) ] - + + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + @property def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([0.3, 0.4, 0.1], [0, 0, 0]) diff --git a/mani_skill/envs/tasks/tabletop/poke_cube.py b/mani_skill/envs/tasks/tabletop/poke_cube.py index 32e3796bb..67078f40f 100644 --- a/mani_skill/envs/tasks/tabletop/poke_cube.py +++ b/mani_skill/envs/tasks/tabletop/poke_cube.py @@ -40,6 +40,13 @@ def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([0.6, 0.7, 0.6], [0.2, 0.2, 0.35]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): self.table_scene = TableSceneBuilder( self, robot_init_qpos_noise=self.robot_init_qpos_noise diff --git a/mani_skill/envs/tasks/tabletop/pull_cube.py b/mani_skill/envs/tasks/tabletop/pull_cube.py index f41976bd2..95e43bc5b 100644 --- a/mani_skill/envs/tasks/tabletop/pull_cube.py +++ b/mani_skill/envs/tasks/tabletop/pull_cube.py @@ -37,6 +37,13 @@ def _default_human_render_camera_configs(self): pose = look_at([0.6, 0.7, 0.6], [0.0, 0.0, 0.35]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): self.table_scene = TableSceneBuilder( env=self, robot_init_qpos_noise=self.robot_init_qpos_noise diff --git a/mani_skill/envs/tasks/tabletop/push_cube.py b/mani_skill/envs/tasks/tabletop/push_cube.py index 1942653d3..54f031eae 100644 --- a/mani_skill/envs/tasks/tabletop/push_cube.py +++ b/mani_skill/envs/tasks/tabletop/push_cube.py @@ -74,7 +74,14 @@ def _default_sim_config(self): found_lost_pairs_capacity=2**25, max_rigid_patch_count=2**18 ) ) - + + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + @property def _default_sensor_configs(self): # registers one 128x128 camera looking at the robot, cube, and target diff --git a/mani_skill/envs/tasks/tabletop/push_t.py b/mani_skill/envs/tasks/tabletop/push_t.py index ac303ae34..775485158 100644 --- a/mani_skill/envs/tasks/tabletop/push_t.py +++ b/mani_skill/envs/tasks/tabletop/push_t.py @@ -173,6 +173,13 @@ def _default_human_render_camera_configs(self): "render_camera", pose=pose, width=512, height=512, fov=1, near=0.01, far=100 ) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): # have to put these parmaeters to device - defined before we had access to device # load scene is a convienent place for this one time operation diff --git a/mani_skill/envs/tasks/tabletop/roll_ball.py 
b/mani_skill/envs/tasks/tabletop/roll_ball.py index 7e65a4620..0e176874e 100644 --- a/mani_skill/envs/tasks/tabletop/roll_ball.py +++ b/mani_skill/envs/tasks/tabletop/roll_ball.py @@ -63,6 +63,13 @@ def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([-0.6, 1.3, 0.8], [0.0, 0.13, 0.0]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): self.table_scene = TableSceneBuilder( self, robot_init_qpos_noise=self.robot_init_qpos_noise diff --git a/mani_skill/envs/tasks/tabletop/stack_cube.py b/mani_skill/envs/tasks/tabletop/stack_cube.py index f76fb284b..e1a07b4ed 100644 --- a/mani_skill/envs/tasks/tabletop/stack_cube.py +++ b/mani_skill/envs/tasks/tabletop/stack_cube.py @@ -31,6 +31,13 @@ def _default_sensor_configs(self): pose = sapien_utils.look_at(eye=[0.3, 0, 0.6], target=[-0.1, 0, 0.1]) return [CameraConfig("base_camera", pose, 128, 128, np.pi / 2, 0.01, 100)] + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + @property def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([0.6, 0.7, 0.6], [0.0, 0.0, 0.35]) diff --git a/mani_skill/envs/tasks/tabletop/turn_faucet.py b/mani_skill/envs/tasks/tabletop/turn_faucet.py index 479c1be55..ba14e59b6 100644 --- a/mani_skill/envs/tasks/tabletop/turn_faucet.py +++ b/mani_skill/envs/tasks/tabletop/turn_faucet.py @@ -68,6 +68,13 @@ def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([0.5, 0.5, 1.0], [0.0, 0.0, 0.5]) return CameraConfig("render_camera", pose=pose, width=512, height=512, fov=1) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): self.scene_builder = TableSceneBuilder( self, robot_init_qpos_noise=self.robot_init_qpos_noise diff --git a/mani_skill/envs/tasks/tabletop/two_robot_pick_cube.py b/mani_skill/envs/tasks/tabletop/two_robot_pick_cube.py index 2411d95ae..6a5053eab 100644 --- a/mani_skill/envs/tasks/tabletop/two_robot_pick_cube.py +++ b/mani_skill/envs/tasks/tabletop/two_robot_pick_cube.py @@ -73,6 +73,13 @@ def _default_human_render_camera_configs(self): pose = sapien_utils.look_at([1.4, 0.8, 0.75], [0.0, 0.1, 0.1]) return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): self.table_scene = TableSceneBuilder( env=self, robot_init_qpos_noise=self.robot_init_qpos_noise diff --git a/mani_skill/envs/tasks/tabletop/two_robot_stack_cube.py b/mani_skill/envs/tasks/tabletop/two_robot_stack_cube.py index 62637e94f..003ca3241 100644 --- a/mani_skill/envs/tasks/tabletop/two_robot_stack_cube.py +++ b/mani_skill/envs/tasks/tabletop/two_robot_stack_cube.py @@ -78,6 +78,13 @@ def _default_human_render_camera_configs(self): pose = sapien_utils.look_at(eye=[0.6, 0.2, 0.4], target=[-0.1, 0, 0.1]) 
return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100) + @property + def _default_voxel_config(self): + return {"coord_bounds": [-1, -1, -1, 2, 2, 2], + "voxel_size": 200, + "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'), + "segmentation": True} + def _load_scene(self, options: dict): self.cube_half_size = common.to_tensor([0.02] * 3) self.table_scene = TableSceneBuilder( diff --git a/mani_skill/envs/utils/observations/__init__.py b/mani_skill/envs/utils/observations/__init__.py index 86b7af6e0..85adf452c 100644 --- a/mani_skill/envs/utils/observations/__init__.py +++ b/mani_skill/envs/utils/observations/__init__.py @@ -1,6 +1,8 @@ from dataclasses import dataclass from .observations import * +from .voxelizer import * + @dataclass diff --git a/mani_skill/envs/utils/observations/observations.py b/mani_skill/envs/utils/observations/observations.py index 52bfaa09a..ed2fb83f6 100644 --- a/mani_skill/envs/utils/observations/observations.py +++ b/mani_skill/envs/utils/observations/observations.py @@ -12,7 +12,7 @@ from mani_skill.sensors.base_sensor import BaseSensor, BaseSensorConfig from mani_skill.sensors.camera import Camera from mani_skill.utils import common - +from mani_skill.envs.utils.observations.voxelizer import VoxelGrid def sensor_data_to_pointcloud(observation: Dict, sensors: Dict[str, BaseSensor]): """convert all camera data in sensor to pointcloud data""" @@ -66,3 +66,109 @@ def sensor_data_to_pointcloud(observation: Dict, sensors: Dict[str, BaseSensor]) # observation["pointcloud"]["segmentation"].numpy().astype(np.uint16) # ) return observation + +def sensor_data_to_voxel( + observation: Dict, + sensors: Dict[str, BaseSensor], + obs_mode_config: Dict + ): + """convert all camera data in sensor to voxel grid""" + sensor_data = observation["sensor_data"] + camera_params = observation["sensor_param"] + coord_bounds = obs_mode_config["coord_bounds"] # [x_min, y_min, z_min, x_max, y_max, z_max] - the metric volume to be voxelized + voxel_size = obs_mode_config["voxel_size"] # size of the voxel grid (assuming cubic) + device = obs_mode_config["device"] # device on which doing voxelization + seg = obs_mode_config["segmentation"] # device on which doing voxelization + pcd_rgb_observations = dict() + bb_maxs = torch.tensor(coord_bounds[3:6]) # max voxel scene boundary + bb_maxs = bb_maxs.to(torch.float) + bb_maxs = bb_maxs.unsqueeze(0) + + # Collect all cameras' observations + for (cam_uid, images), (sensor_uid, sensor) in zip( + sensor_data.items(), sensors.items() + ): + assert cam_uid == sensor_uid + if isinstance(sensor, Camera): + cam_data = {} + + # Extract point cloud and segmentation data + images: Dict[str, torch.Tensor] + position = images["PositionSegmentation"] + if seg: + segmentation = position[..., 3].clone() + position[..., 3] = position[..., 3] != 0 # mask out infinitely far points + position = position.float() # position -> camera-frame xyzw coordinates + + # Record w=0 (infinitely far) points' indices + out_indices = (position[..., 3] == 0) + out_indices = out_indices.reshape(out_indices.shape[0], -1) + + # Convert to world space position and update camera data + position[..., 3] = 1 # for matrix multiplication + position[..., :3] = ( + position[..., :3] / 1000.0 + ) # convert the raw depth from millimeters to meters + cam2world = camera_params[cam_uid]["cam2world_gl"] + xyzw = position.reshape(position.shape[0], -1, 4) @ cam2world.transpose( + 1, 2 + ) + + # Set w=0 points outside the bounds, so that they can be cropped during 
voxelization + xyz = xyzw[..., :3] / xyzw[..., 3].unsqueeze(-1) # dehomogeneize + xyz[out_indices, :] = bb_maxs + 1 + cam_data["xyz"] = xyz + + # Extract seg and rgb data + if seg: + cam_data["seg"] = segmentation.reshape(segmentation.shape[0], -1, 1) + if "Color" in images: + rgb = images["Color"][..., :3].clone() + rgb = rgb / 255 # convert to range [0, 1] + cam_data["rgb"] = rgb.reshape(rgb.shape[0], -1, 3) + + pcd_rgb_observations[cam_uid] = cam_data + + # just free sensor_data to save memory + for k in pcd_rgb_observations.keys(): + del observation["sensor_data"][k] + + # merge features from different cameras together + pcd_rgb_observations = common.merge_dicts(pcd_rgb_observations.values()) + for key, value in pcd_rgb_observations.items(): + pcd_rgb_observations[key] = torch.concat(value, axis=1) + + # prepare features for voxel convertions + xyz_dev = pcd_rgb_observations["xyz"].to(device) + rgb_dev = pcd_rgb_observations["rgb"].to(device) + if seg: + seg_dev = pcd_rgb_observations["seg"].to(device) + coord_bounds = torch.tensor(coord_bounds, device=device).unsqueeze(0) + batch_size = xyz_dev.shape[0] + max_num_coords = rgb_dev.shape[1] + vox_grid = VoxelGrid( + coord_bounds=coord_bounds, + voxel_size=voxel_size, + device=device, + batch_size=batch_size, + feature_size=3, + max_num_coords=max_num_coords, + ) + + # convert to the batched voxel grids + # voxel 11D features contain: 3 (pcd xyz coordinates) + 3 (rgb) + 3 (voxel xyz indices) + 1 (seg id if applicable) + 1 (occupancy) + if seg: # add voxel segmentations + voxel_grid = vox_grid.coords_to_bounding_voxel_grid(xyz_dev, + coord_features=rgb_dev, + coord_bounds=coord_bounds, + clamp_vox_id=True, + pcd_seg=seg_dev) + else: # no voxel segmentation + voxel_grid = vox_grid.coords_to_bounding_voxel_grid(xyz_dev, + coord_features=rgb_dev, + coord_bounds=coord_bounds, + clamp_vox_id=False) + + # update voxel grids to the observation dict + observation["voxel_grid"] = voxel_grid + return observation \ No newline at end of file diff --git a/mani_skill/envs/utils/observations/voxelizer.py b/mani_skill/envs/utils/observations/voxelizer.py new file mode 100644 index 000000000..8a0eac7c4 --- /dev/null +++ b/mani_skill/envs/utils/observations/voxelizer.py @@ -0,0 +1,238 @@ +# Modified from https://github.com/stepjam/ARM/blob/main/arm/c2farm/voxel_grid.py +from functools import reduce as funtool_reduce +from operator import mul + +import torch +from torch import nn, einsum +import torch.nn.functional as F + +MIN_DENOMINATOR = 1e-12 +INCLUDE_PER_VOXEL_COORD = False + + +class VoxelGrid(nn.Module): + + def __init__(self, + coord_bounds, + voxel_size: int, + device, + batch_size, + feature_size, + max_num_coords: int,): + super(VoxelGrid, self).__init__() + self._device = device + self._voxel_size = voxel_size + self._voxel_shape = [voxel_size] * 3 + self._voxel_d = float(self._voxel_shape[-1]) + self._voxel_feature_size = 4 + feature_size + self._voxel_shape_spec = torch.tensor(self._voxel_shape, + device=device).unsqueeze( + 0) + 2 # +2 because we crop the edges. 
+ self._coord_bounds = torch.tensor(coord_bounds, dtype=torch.float, + device=device).unsqueeze(0) + max_dims = self._voxel_shape_spec[0] + self._total_dims_list = torch.cat( + [torch.tensor([batch_size], device=device), max_dims, + torch.tensor([4 + feature_size], device=device)], -1).tolist() + self._ones_max_coords = torch.ones((batch_size, max_num_coords, 1), + device=device) + self._num_coords = max_num_coords + + shape = self._total_dims_list + + self._result_dim_sizes = torch.tensor( + [funtool_reduce(mul, shape[i + 1:], 1) for i in range(len(shape) - 1)] + [ + 1], device=device) + flat_result_size = funtool_reduce(mul, shape, 1) + + self._initial_val = torch.tensor(0, dtype=torch.float, + device=device) + self._flat_output = torch.ones(flat_result_size, dtype=torch.float, + device=device) * self._initial_val + self._arange_to_max_coords = torch.arange(4 + feature_size, + device=device) + self._flat_zeros = torch.zeros(flat_result_size, dtype=torch.float, + device=device) + + self._const_1 = torch.tensor(1.0, device=device) + self._batch_size = batch_size + + # Coordinate Bounds: + self._bb_mins = self._coord_bounds[..., 0:3] + bb_maxs = self._coord_bounds[..., 3:6] + bb_ranges = bb_maxs - self._bb_mins + + # get voxel dimensions. 'DIMS' mode + self._dims = dims = self._voxel_shape_spec.int() # the number of voxels for each dimension after padding + self._dims_orig = dims_orig = self._voxel_shape_spec.int() - 2 # voxel number for each dimension before padding + self._dims_m_one = (dims - 1).int() # upper voxel index bounds + + # BS x 1 x 3 + self._res = bb_ranges / (dims_orig.float() + MIN_DENOMINATOR) + self._res_minis_2 = bb_ranges / (dims.float() - 2 + MIN_DENOMINATOR) + + self._voxel_indicy_denmominator = self._res + MIN_DENOMINATOR + self._dims_m_one_zeros = torch.zeros_like(self._dims_m_one) # lower voxel index bounds (0) + + batch_indices = torch.arange(self._batch_size, dtype=torch.int, + device=device).view(self._batch_size, 1, 1) + self._tiled_batch_indices = batch_indices.repeat( + [1, self._num_coords, 1]) + + w = self._voxel_shape[0] + 2 + arange = torch.arange(0, w, dtype=torch.float, device=device) + self._index_grid = torch.cat([ + arange.view(w, 1, 1, 1).repeat([1, w, w, 1]), + arange.view(1, w, 1, 1).repeat([w, 1, w, 1]), + arange.view(1, 1, w, 1).repeat([w, w, 1, 1])], dim=-1).unsqueeze( + 0).repeat([self._batch_size, 1, 1, 1, 1]) + + def _broadcast(self, src: torch.Tensor, other: torch.Tensor, dim: int): + if dim < 0: + dim = other.dim() + dim + if src.dim() == 1: + for _ in range(0, dim): + src = src.unsqueeze(0) + for _ in range(src.dim(), other.dim()): + src = src.unsqueeze(-1) + src = src.expand_as(other) + return src + + def _scatter_mean(self, src: torch.Tensor, index: torch.Tensor, out: torch.Tensor, + dim: int = -1): + # scatter pcd features to voxel features by taking the mean + out = out.scatter_add_(dim, index, src) + + index_dim = dim + if index_dim < 0: + index_dim = index_dim + src.dim() + if index.dim() <= index_dim: + index_dim = index.dim() - 1 + + ones = torch.ones(index.size(), dtype=src.dtype, device=src.device) + out_count = torch.zeros(out.size(), dtype=out.dtype, device=out.device) + out_count = out_count.scatter_add_(index_dim, index, ones) + out_count.clamp_(1) + count = self._broadcast(out_count, out, dim) + if torch.is_floating_point(out): + out.true_divide_(count) + else: + out.floor_divide_(count) + return out + + def _scatter_nd(self, indices, updates): + # scatter the array of voxel indices and values to the voxel grid + 
indices_shape = indices.shape + num_index_dims = indices_shape[-1] + flat_updates = updates.view((-1,)) + indices_scales = self._result_dim_sizes[0:num_index_dims].view( + [1] * (len(indices_shape) - 1) + [num_index_dims]) + indices_for_flat_tiled = ((indices * indices_scales).sum( + dim=-1, keepdims=True)).view(-1, 1).repeat( + *[1, self._voxel_feature_size]) + + implicit_indices = self._arange_to_max_coords[ + :self._voxel_feature_size].unsqueeze(0).repeat( + *[indices_for_flat_tiled.shape[0], 1]) + indices_for_flat = indices_for_flat_tiled + implicit_indices + flat_indices_for_flat = indices_for_flat.view((-1,)).long() + + flat_scatter = self._scatter_mean( + flat_updates, flat_indices_for_flat, + out=torch.zeros_like(self._flat_output)) + return flat_scatter.view(self._total_dims_list) + + def _clamp_voxel_seg_grid(self, voxel_indices: torch.Tensor, pcd_seg: torch.Tensor, voxel_size: int): + # clamp point cloud segmentations to voxel segmentations + B, _, _ = voxel_indices.shape + dims = torch.tensor([voxel_size] * 3, device=voxel_indices.device).int() + flat_voxel_indices = voxel_indices[..., 0] * (dims[1] * dims[2]) + voxel_indices[..., 1] * dims[2] + voxel_indices[..., 2] + flat_voxel_indices = flat_voxel_indices.view(B, -1) + + # Create a tensor to record voxel classes + num_classes = pcd_seg.max().item() + 1 + class_counts = torch.zeros(B, voxel_size * voxel_size * voxel_size, num_classes, device=voxel_indices.device, dtype=torch.float) + + # Create one-hot encodings of class indices + one_hot_class_indices = F.one_hot(pcd_seg.long(), num_classes=num_classes).float() + one_hot_class_indices = one_hot_class_indices.squeeze(2) + + # Add one-hot encodings to the class counts based on voxel indices + class_counts.scatter_add_(1, flat_voxel_indices.unsqueeze(-1).expand(-1, -1, num_classes).to(torch.int64), + one_hot_class_indices) + + # Find the class with the maximum count in each voxel + class_counts = class_counts.view(B, voxel_size, voxel_size, voxel_size, num_classes) + voxel_class_indices = class_counts.argmax(dim=-1).unsqueeze(-1) + return voxel_class_indices # [B, voxel_size, voxel_size, voxel_size, 1] + + def coords_to_bounding_voxel_grid(self, coords: torch.Tensor, coord_features=None, + coord_bounds=None, clamp_vox_id=False, pcd_seg=None): + # converting a batch of pcds with its features to a batch of voxels + voxel_indicy_denmominator = self._voxel_indicy_denmominator + res, bb_mins = self._res, self._bb_mins + if coord_bounds is not None: + bb_mins = coord_bounds[..., 0:3] + bb_maxs = coord_bounds[..., 3:6] + bb_ranges = bb_maxs - bb_mins + res = bb_ranges / (self._dims_orig.float() + MIN_DENOMINATOR) # reduce res so that farthest-side edge points can be clipped + voxel_indicy_denmominator = res + MIN_DENOMINATOR + bb_mins_shifted = bb_mins - res + floor = torch.floor( # use back-shifted bb_min so that 0-side edge points can be clipped + (coords - bb_mins_shifted.unsqueeze(1)) / voxel_indicy_denmominator.unsqueeze(1)).int() + + # get voxel indices before clipping (ranging [0, self._voxel_size+1]) + voxel_indices = torch.min(floor, self._dims_m_one) # clip into the maximal voxel index bounds + voxel_indices = torch.max(voxel_indices, self._dims_m_one_zeros) # clip into the minimal voxel index bounds ((0,0,0)) + + # global-coordinate point cloud (x, y, z) + voxel_values = coords + + # rgb values (R, G, B) + if coord_features is not None: + voxel_values = torch.cat([voxel_values, coord_features], -1) # concat rgb values (B, 128, 128, 3) + + # coordinates to aggregate over + 
_, num_coords, _ = voxel_indices.shape + all_indices = torch.cat([ # tile voxel index with batch index, so that different scenes in different batches don't mix + self._tiled_batch_indices[:, :num_coords], voxel_indices], -1) + + # count point cloud occupancy + voxel_values_pruned_flat = torch.cat( + [voxel_values, self._ones_max_coords[:, :num_coords]], -1) + + # scatter to get the voxel grid + scattered = self._scatter_nd( + all_indices.view([-1, 1 + 3]), + voxel_values_pruned_flat.view(-1, self._voxel_feature_size)) # 3+4=7 + vox = scattered[:, 1:-1, 1:-1, 1:-1] # clip the edges + if INCLUDE_PER_VOXEL_COORD: + res_expanded = res.unsqueeze(1).unsqueeze(1).unsqueeze(1) + res_centre = (res_expanded * self._index_grid) + res_expanded / 2.0 + coord_positions = (res_centre + bb_mins_shifted.unsqueeze( + 1).unsqueeze(1).unsqueeze(1))[:, 1:-1, 1:-1, 1:-1] # compute each voxel's 3d position + vox = torch.cat([vox[..., :-1], coord_positions, vox[..., -1:]], -1) + + # occupancy grid + occupied = (vox[..., -1:] > 0).float() + vox = torch.cat([ + vox[..., :-1], occupied], -1) + + # hard normalized voxel-location position encoding + vox = torch.cat( + [vox[..., :-1], self._index_grid[:, :-2, :-2, :-2] / self._voxel_d, + vox[..., -1:]], -1) + + # add clamped voxel id if applicable + if clamp_vox_id: + assert pcd_seg != None, "Please provide the point cloud segmentations for clamping to voxel segmentations" + vox_seg_grid = self._clamp_voxel_seg_grid(voxel_indices, pcd_seg, self._voxel_size+2) + vox_seg_grid = vox_seg_grid[:, 1:-1, 1:-1, 1:-1] + vox = torch.cat([vox[..., :-1], vox_seg_grid, + vox[..., -1:]], -1) + + # voxel 11D features contain: 3 (pcd xyz coordinates) + 3 (rgb) + 3 (voxel xyz indices) + 1 (seg id if applicable) + 1 (occupancy) + return vox + + + diff --git a/mani_skill/examples/demo_vis_rgbd.py b/mani_skill/examples/demo_vis_rgbd.py index 961b437a9..84cda3ebb 100644 --- a/mani_skill/examples/demo_vis_rgbd.py +++ b/mani_skill/examples/demo_vis_rgbd.py @@ -67,18 +67,21 @@ def main(args): action = env.action_space.sample() obs, reward, terminated, truncated, info = env.step(action) cam_num = 0 - imgs=[] + rgbImgs=[] + depthImgs=[] for cam in obs["sensor_data"].keys(): if "rgb" in obs["sensor_data"][cam]: rgb = common.to_numpy(obs["sensor_data"][cam]["rgb"][0]) - imgs.append(rgb) - if "depth" in obs["sensor_data"][cam]: - depth = common.to_numpy(obs["sensor_data"][cam]["depth"][0]).astype(np.float32) - depth = depth / (depth.max() - depth.min()) - depth_rgb = np.zeros_like(rgb) - depth_rgb[..., :] = depth*255 - imgs.append(depth_rgb) + depth = common.to_numpy(obs["sensor_data"][cam]["depth"][0]).astype(np.float32) + depth = depth / (depth.max() - depth.min()) + rgbImgs.append(rgb) + depth_rgb = np.zeros_like(rgb) + depth_rgb[..., :] = depth*255 + depthImgs.append(depth_rgb) + cam_num += 1 + + imgs = rgbImgs + depthImgs img = visualization.tile_images(imgs, nrows=n_cams) renderer(img) diff --git a/mani_skill/examples/demo_vis_voxel.py b/mani_skill/examples/demo_vis_voxel.py new file mode 100644 index 000000000..894bb6ecc --- /dev/null +++ b/mani_skill/examples/demo_vis_voxel.py @@ -0,0 +1,122 @@ +import argparse + +import gymnasium as gym +import numpy as np + +from mani_skill.envs.sapien_env import BaseEnv +from mani_skill.utils.visualization.voxel_visualizer import visualise_voxel +import torch +from tqdm import tqdm +import matplotlib.pyplot as plt +import cv2 + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--env-id", type=str, 
default="PushCube-v1", help="The environment ID of the task you want to simulate") + parser.add_argument("--use-default", type=bool, default=False, help="Whether or not to use default task observation mode configs. If yes, no override will take place") + parser.add_argument("--device", type=str, default="cuda", help="The device on which voxelization is done. Should be either cuda or cpu") + parser.add_argument("--voxel-size", type=int, default=200, help="The number of voxels per side") + parser.add_argument("--zoom-factor", type=float, default=1.5, help="Zoom-in factor of the camera when generating the output voxel visualizations") + parser.add_argument("--segmentation", type=bool, default=False, action=argparse.BooleanOptionalAction, help="Whether or not to include voxel segmentation estimations") + parser.add_argument("--rotation-amount", type=float, default=45, help="The amount of rotation of camera for filming the voxelized scene") + parser.add_argument("--cam-width", type=int, default=720, help="Override the width of every camera in the environment") + parser.add_argument("--cam-height", type=int, default=480, help="Override the height of every camera in the environment") + parser.add_argument( + "--coord-bounds", + nargs="*", + type=float, + default=[-1, -1, -1, 2, 2, 2], + help="The bounds of the 3D points' coordinates to be voxelized" + ) + parser.add_argument( + "-s", + "--seed", + type=int, + help="Seed the random actions and environment. Default is no seed", + ) + args = parser.parse_args() + return args + +def render_filtered_voxels(voxel_grid, zf=1.0, rotation_amount=60, env_id="PushCube-v1"): + if (env_id in ["TwoRobotPickCube-v1", "TwoRobotStackCube-v1"]): + # different envs have different floor id + flood_id = 32 + else: + flood_id = 17 + vis_voxel_grid = voxel_grid.permute(0, 4, 1, 2, 3).detach().cpu().numpy() + floor_map = (vis_voxel_grid[:, 9, ...] == flood_id) + floor_map = torch.tensor(floor_map) + floor_map = floor_map.unsqueeze(1).repeat(1, 11, 1, 1, 1) + vis_voxel_grid[floor_map] = 0 + + rendered_img = visualise_voxel(vis_voxel_grid[0], + None, + None, + voxel_size=0.01, + zoom_factor=zf, + rotation_amount=np.deg2rad(rotation_amount)) + return rendered_img + +def render_all_voxels(voxel_grid, zf=1.0, rotation_amount=60): + vis_voxel_grid = voxel_grid.permute(0, 4, 1, 2, 3).detach().cpu().numpy() + rendered_img = visualise_voxel(vis_voxel_grid[0], + None, + None, + voxel_size=0.01, + zoom_factor=zf, + rotation_amount=np.deg2rad(rotation_amount)) + return rendered_img + +def main(args): + if args.seed is not None: + np.random.seed(args.seed) + sensor_configs = dict() + if args.cam_width: + sensor_configs["width"] = args.cam_width + if args.cam_height: + sensor_configs["height"] = args.cam_height + obs_mode_config = {"coord_bounds": args.coord_bounds, + "voxel_size": args.voxel_size, + "device": torch.device(args.device), + "segmentation": args.segmentation} + # init the environment + if not args.use_default: + env: BaseEnv = gym.make( + args.env_id, + obs_mode="voxel", + reward_mode="none", + obs_mode_config=obs_mode_config, + sensor_configs=sensor_configs, + ) + else: + # Not overriding obs_mode_config. 
This works only for some tabletop tasks like PushCube-v1 and PickCube-v1 + env: BaseEnv = gym.make( + args.env_id, + obs_mode="voxel", + ) + + # Interactively show the voxelized scene + zf = args.zoom_factor # controlling camera zoom-ins + obs, _ = env.reset() + print(f"Shape of the voxel grids: {obs['voxel_grid'].shape}") + while True: + action = env.action_space.sample() + obs, reward, terminated, truncated, info = env.step(action) + voxel_grid = obs["voxel_grid"] + if args.segmentation: + # if voxel seg id is available, filter the floors and then render the scene voxels + img = render_filtered_voxels(voxel_grid, zf, args.rotation_amount, args.env_id) + else: + # if voxel seg id not available, just render the scene voxels + img = render_all_voxels(voxel_grid, zf, args.rotation_amount) + plt.axis('off') + plt.imshow(img) + plt.show() + if terminated or truncated: + break + env.close() + + + +if __name__ == "__main__": + main(parse_args()) diff --git a/mani_skill/utils/visualization/__init__.py b/mani_skill/utils/visualization/__init__.py index 48e913fd6..9477ea7d6 100644 --- a/mani_skill/utils/visualization/__init__.py +++ b/mani_skill/utils/visualization/__init__.py @@ -1,3 +1,4 @@ from .jupyter_utils import display_images from .misc import images_to_video, put_info_on_image, put_text_on_image, tile_images from .renderer import ImageRenderer +from .voxel_visualizer import * diff --git a/mani_skill/utils/visualization/voxel_visualizer.py b/mani_skill/utils/visualization/voxel_visualizer.py new file mode 100644 index 000000000..d1867c087 --- /dev/null +++ b/mani_skill/utils/visualization/voxel_visualizer.py @@ -0,0 +1,200 @@ +# Borrowed and modified from https://github.com/stepjam/ARM/blob/main/arm/utils.py + +import numpy as np +import pyrender +import torch +import trimesh +from pyrender.trackball import Trackball +from scipy.spatial.transform import Rotation + +# SCALE_FACTOR = DEPTH_SCALE +DEFAULT_SCENE_SCALE = 2.0 + + +def loss_weights(replay_sample, beta=1.0): + loss_weights = 1.0 + if 'sampling_probabilities' in replay_sample: + probs = replay_sample['sampling_probabilities'] + loss_weights = 1.0 / torch.sqrt(probs + 1e-10) + loss_weights = (loss_weights / torch.max(loss_weights)) ** beta + return loss_weights + + +def soft_updates(net, target_net, tau): + for param, target_param in zip(net.parameters(), target_net.parameters()): + target_param.data.copy_( + tau * param.data + (1 - tau) * target_param.data + ) + + +def stack_on_channel(x): + # expect (B, T, C, ...) 
+ return torch.cat(torch.split(x, 1, dim=1), dim=2).squeeze(1) + + +def normalize_quaternion(quat): + return np.array(quat) / np.linalg.norm(quat, axis=-1, keepdims=True) + + +def quaternion_to_discrete_euler(quaternion, resolution): + euler = Rotation.from_quat(quaternion).as_euler('xyz', degrees=True) + 180 + assert np.min(euler) >= 0 and np.max(euler) <= 360 + disc = np.around((euler / resolution)).astype(int) + disc[disc == int(360 / resolution)] = 0 + return disc + + +def discrete_euler_to_quaternion(discrete_euler, resolution): + euluer = (discrete_euler * resolution) - 180 + return Rotation.from_euler('xyz', euluer, degrees=True).as_quat() + + +def point_to_voxel_index( + point: np.ndarray, + voxel_size: np.ndarray, + coord_bounds: np.ndarray): + bb_mins = np.array(coord_bounds[0:3]) + bb_maxs = np.array(coord_bounds[3:]) + dims_m_one = np.array([voxel_size] * 3) - 1 + bb_ranges = bb_maxs - bb_mins + res = bb_ranges / (np.array([voxel_size] * 3) + 1e-12) + voxel_indicy = np.minimum( + np.floor((point - bb_mins) / (res + 1e-12)).astype( + np.int32), dims_m_one) + return voxel_indicy + + +def point_to_pixel_index( + point: np.ndarray, + extrinsics: np.ndarray, + intrinsics: np.ndarray): + point = np.array([point[0], point[1], point[2], 1]) + world_to_cam = np.linalg.inv(extrinsics) + point_in_cam_frame = world_to_cam.dot(point) + px, py, pz = point_in_cam_frame[:3] + px = 2 * intrinsics[0, 2] - int(-intrinsics[0, 0] * (px / pz) + intrinsics[0, 2]) + py = 2 * intrinsics[1, 2] - int(-intrinsics[1, 1] * (py / pz) + intrinsics[1, 2]) + return px, py + + +def _compute_initial_camera_pose(scene): + # Adapted from: + # https://github.com/mmatl/pyrender/blob/master/pyrender/viewer.py#L1032 + centroid = scene.centroid + scale = scene.scale + if scale == 0.0: + scale = DEFAULT_SCENE_SCALE + s2 = 1.0 / np.sqrt(2.0) + cp = np.eye(4) + cp[:3, :3] = np.array([[0.0, -s2, s2], [1.0, 0.0, 0.0], [0.0, s2, s2]]) + hfov = np.pi / 6.0 + dist = scale / (2.0 * np.tan(hfov)) + cp[:3, 3] = dist * np.array([1.0, 0.0, 1.0]) + centroid + return cp + + +def _from_trimesh_scene( + trimesh_scene, bg_color=None, ambient_light=None): + # convert trimesh geometries to pyrender geometries + geometries = {name: pyrender.Mesh.from_trimesh(geom, smooth=False) + for name, geom in trimesh_scene.geometry.items()} + # create the pyrender scene object + scene_pr = pyrender.Scene(bg_color=bg_color, ambient_light=ambient_light) + # add every node with geometry to the pyrender scene + for node in trimesh_scene.graph.nodes_geometry: + pose, geom_name = trimesh_scene.graph[node] + scene_pr.add(geometries[geom_name], pose=pose) + return scene_pr + + +def _create_bounding_box(scene, voxel_size, res): + l = voxel_size * res + T = np.eye(4) + w = 0.01 + for trans in [[0, 0, l / 2], [0, l, l / 2], [l, l, l / 2], [l, 0, l / 2]]: + T[:3, 3] = np.array(trans) - voxel_size / 2 + scene.add_geometry(trimesh.creation.box( + [w, w, l], T, face_colors=[0, 0, 0, 255])) + for trans in [[l / 2, 0, 0], [l / 2, 0, l], [l / 2, l, 0], [l / 2, l, l]]: + T[:3, 3] = np.array(trans) - voxel_size / 2 + scene.add_geometry(trimesh.creation.box( + [l, w, w], T, face_colors=[0, 0, 0, 255])) + for trans in [[0, l / 2, 0], [0, l / 2, l], [l, l / 2, 0], [l, l / 2, l]]: + T[:3, 3] = np.array(trans) - voxel_size / 2 + scene.add_geometry(trimesh.creation.box( + [w, l, w], T, face_colors=[0, 0, 0, 255])) + + +def create_voxel_scene( + voxel_grid: np.ndarray, + q_attention: np.ndarray = None, + highlight_coordinate: np.ndarray = None, + highlight_alpha: float = 1.0, + 
voxel_size: float = 0.1, + show_bb: bool = False, + alpha: float = 0.5): + _, d, h, w = voxel_grid.shape + v = voxel_grid.transpose((1, 2, 3, 0)) + occupancy = v[:, :, :, -1] != 0 + alpha = np.expand_dims(np.full_like(occupancy, alpha, dtype=np.float32), -1) + rgb = np.concatenate([(v[:, :, :, 3:6] + 1)/ 2.0, alpha], axis=-1) + + if q_attention is not None: + q = np.max(q_attention, 0) + q = q / np.max(q) + show_q = (q > 0.75) + occupancy = (show_q + occupancy).astype(bool) + q = np.expand_dims(q - 0.5, -1) # Max q can be is 0.9 + q_rgb = np.concatenate([ + q, np.zeros_like(q), np.zeros_like(q), + np.clip(q, 0, 1)], axis=-1) + rgb = np.where(np.expand_dims(show_q, -1), q_rgb, rgb) + + if highlight_coordinate is not None: + x, y, z = highlight_coordinate + occupancy[x, y, z] = True + rgb[x, y, z] = [1.0, 0.0, 0.0, highlight_alpha] + + transform = trimesh.transformations.scale_and_translate( + scale=voxel_size, translate=(0.0, 0.0, 0.0)) + trimesh_voxel_grid = trimesh.voxel.VoxelGrid( + encoding=occupancy, transform=transform) + geometry = trimesh_voxel_grid.as_boxes(colors=rgb) + scene = trimesh.Scene() + scene.add_geometry(geometry) + if show_bb: + assert d == h == w + _create_bounding_box(scene, voxel_size, d) + return scene + + +def visualise_voxel(voxel_grid: np.ndarray, + q_attention: np.ndarray = None, + highlight_coordinate: np.ndarray = None, + highlight_alpha: float = 1.0, + rotation_amount: float = 0.0, + show: bool = False, + voxel_size: float = 0.1, + offscreen_renderer: pyrender.OffscreenRenderer = None, + show_bb: bool = False, + zoom_factor: float = 1.0): + scene = create_voxel_scene( + voxel_grid, q_attention, highlight_coordinate, + highlight_alpha, voxel_size, show_bb) + if show: + scene.show() + else: + r = offscreen_renderer or pyrender.OffscreenRenderer( + viewport_width=640, viewport_height=480, point_size=1.0) + s = _from_trimesh_scene( + scene, ambient_light=[0.8, 0.8, 0.8], + bg_color=[1.0, 1.0, 1.0]) + cam = pyrender.PerspectiveCamera( + yfov=np.pi / 4.0 / zoom_factor, aspectRatio=r.viewport_width/r.viewport_height) + p = _compute_initial_camera_pose(s) + t = Trackball(p, (r.viewport_width, r.viewport_height), s.scale, s.centroid) + t.rotate(rotation_amount, np.array([0.0, 0.0, 1.0])) + s.add(cam, pose=t.pose) + color, depth = r.render(s) + return color.copy() + \ No newline at end of file