diff --git a/zsos/mapping/base_map.py b/zsos/mapping/base_map.py
index 165c1b7..02984b1 100644
--- a/zsos/mapping/base_map.py
+++ b/zsos/mapping/base_map.py
@@ -3,11 +3,6 @@
 import numpy as np
 
 from zsos.mapping.traj_visualizer import TrajectoryVisualizer
-from zsos.utils.geometry_utils import extract_yaw
-from zsos.utils.img_utils import (
-    place_img_in_img,
-    rotate_image,
-)
 
 
 class BaseMap:
@@ -16,29 +11,12 @@ class BaseMap:
     _last_camera_yaw: float = None
     _map_dtype: np.dtype = np.float32
 
-    def __init__(
-        self,
-        fov: float,
-        min_depth: float,
-        max_depth: float,
-        size: int = 1000,
-        *args,
-        **kwargs,
-    ):
+    def __init__(self, size: int = 1000, *args, **kwargs):
         """
         Args:
-            num_channels: The number of channels in the value map.
-            fov: The field of view of the camera in degrees.
-            max_depth: The desired maximum depth of the camera in meters.
-            use_max_confidence: Whether to use the maximum confidence value in the value
-                map or a weighted average confidence value.
+            size: The size of the map in pixels.
         """
         self.pixels_per_meter = 20
-
-        self._fov = np.deg2rad(fov)
-        self._min_depth = min_depth
-        self._max_depth = max_depth
-
         self._map = np.zeros((size, size), dtype=self._map_dtype)
         self._episode_pixel_origin = np.array([size // 2, size // 2])
         self._traj_vis = TrajectoryVisualizer(
@@ -52,36 +30,6 @@ def reset(self):
             self._episode_pixel_origin, self.pixels_per_meter
         )
 
-    def _localize_new_data(
-        self, depth: np.ndarray, tf_camera_to_episodic: np.ndarray
-    ) -> np.ndarray:
-        # Get new portion of the map
-        curr_data = self._process_local_data(depth, tf_camera_to_episodic)
-
-        # Rotate this new data to match the camera's orientation
-        self._last_camera_yaw = yaw = extract_yaw(tf_camera_to_episodic)
-        curr_data = rotate_image(curr_data, -yaw)
-
-        # Determine where this mask should be overlaid
-        cam_x, cam_y = tf_camera_to_episodic[:2, 3] / tf_camera_to_episodic[3, 3]
-        self._camera_positions.append(np.array([cam_x, cam_y]))
-
-        # Convert to pixel units
-        px = int(cam_x * self.pixels_per_meter) + self._episode_pixel_origin[0]
-        py = int(-cam_y * self.pixels_per_meter) + self._episode_pixel_origin[1]
-
-        # Overlay the new data onto the map
-        curr_map = np.zeros_like(self._map)
-        curr_map = place_img_in_img(curr_map, curr_data, px, py)
-
-        return curr_map
-
-    def _process_local_data(
-        self, depth: np.ndarray, tf_camera_to_episodic: np.ndarray
-    ) -> np.ndarray:
-        """Processes the local data (depth image) to be used for updating the map."""
-        raise NotImplementedError
-
     def _xy_to_px(self, points: np.ndarray) -> np.ndarray:
         """Converts an array of (x, y) coordinates to pixel coordinates.
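The base class now owns only the grid and its pixel-origin convention; the data-localization helpers move down into `ValueMap` (see value_map.py below). A standalone sketch of that convention, using the constants from this file (the sample pose is made up):

```python
import numpy as np

size = 1000
pixels_per_meter = 20  # BaseMap hard-codes this resolution
episode_pixel_origin = np.array([size // 2, size // 2])  # episode origin = map center

# A camera 2 m forward (+x) and 1 m left (+y) of the episode origin lands at:
cam_x, cam_y = 2.0, 1.0
px = int(cam_x * pixels_per_meter) + episode_pixel_origin[0]
py = int(-cam_y * pixels_per_meter) + episode_pixel_origin[1]  # +y maps to a -row offset
print(px, py)  # 540, 480
```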
diff --git a/zsos/mapping/object_point_cloud_map.py b/zsos/mapping/object_point_cloud_map.py
index acd5293..824c258 100644
--- a/zsos/mapping/object_point_cloud_map.py
+++ b/zsos/mapping/object_point_cloud_map.py
@@ -9,28 +9,10 @@ class ObjectPointCloudMap:
     clouds: Dict[str, np.ndarray] = {}
 
-    _image_width: int = None  # set upon execution of update_map method
-    _image_height: int = None  # set upon execution of update_map method
-    __fx: float = None  # set upon execution of update_map method
-
-    def __init__(
-        self, min_depth: float, max_depth: float, hfov: float, erosion_size: float
-    ) -> None:
-        self._min_depth = min_depth
-        self._max_depth = max_depth
-        self._hfov = np.deg2rad(hfov)
+    def __init__(self, erosion_size: float) -> None:
         self._erosion_size = erosion_size
 
-    @property
-    def _fx(self) -> float:
-        if self.__fx is None:
-            self.__fx = self._image_width / (2 * np.tan(self._hfov / 2))
-        return self.__fx
-
-    @property
-    def _fy(self) -> float:
-        return self._fx
-
     def reset(self):
         self.clouds = {}
 
@@ -43,14 +25,19 @@ def update_map(
         depth_img: np.ndarray,
         object_mask: np.ndarray,
         tf_camera_to_episodic: np.ndarray,
+        min_depth: float,
+        max_depth: float,
+        fx: float,
+        fy: float,
     ) -> None:
         """Updates the object map with the latest information from the agent."""
-        self._image_height, self._image_width = depth_img.shape
-        local_cloud = self._extract_object_cloud(depth_img, object_mask)
+        local_cloud = self._extract_object_cloud(
+            depth_img, object_mask, min_depth, max_depth, fx, fy
+        )
 
         # Mark all points of local_cloud whose distance from the camera is too far
         # as being out of range
-        within_range = local_cloud[:, 0] <= self._max_depth * 0.95  # 5% margin
+        within_range = local_cloud[:, 0] <= max_depth * 0.95  # 5% margin
         global_cloud = transform_points(tf_camera_to_episodic, local_cloud)
         global_cloud = np.concatenate((global_cloud, within_range[:, None]), axis=1)
@@ -87,17 +74,21 @@ def get_target_cloud(self, target_class: str) -> np.ndarray:
         return target_cloud
 
     def _extract_object_cloud(
-        self, depth: np.ndarray, object_mask: np.ndarray
+        self,
+        depth: np.ndarray,
+        object_mask: np.ndarray,
+        min_depth: float,
+        max_depth: float,
+        fx: float,
+        fy: float,
     ) -> np.ndarray:
         final_mask = object_mask * 255
         final_mask = cv2.erode(final_mask, None, iterations=self._erosion_size)
 
         valid_depth = depth.copy()
         valid_depth[valid_depth == 0] = 1  # set all holes (0) to just be far (1)
-        valid_depth = (
-            valid_depth * (self._max_depth - self._min_depth) + self._min_depth
-        )
-        cloud = get_point_cloud(valid_depth, final_mask, self._fx, self._fy)
+        valid_depth = valid_depth * (max_depth - min_depth) + min_depth
+        cloud = get_point_cloud(valid_depth, final_mask, fx, fy)
        cloud = open3d_dbscan_filtering(cloud)
 
        return cloud
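Intrinsics are now supplied per call rather than cached on the map. The caller derives the focal length with the same formula the removed `_fx` property used; for example (sensor numbers illustrative):

```python
import numpy as np

hfov = np.deg2rad(79.0)  # horizontal FOV in radians; 79 deg is illustrative
image_width = 640        # pixels; illustrative
fx = fy = image_width / (2 * np.tan(hfov / 2))  # identical to the removed _fx property
```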
diff --git a/zsos/mapping/obstacle_map.py b/zsos/mapping/obstacle_map.py
index f68fac8..586427b 100644
--- a/zsos/mapping/obstacle_map.py
+++ b/zsos/mapping/obstacle_map.py
@@ -22,24 +22,19 @@ class ObstacleMap(BaseMap):
 
     def __init__(
         self,
-        fov: float,
-        min_depth: float,
-        max_depth: float,
         min_height: float,
         max_height: float,
         agent_radius: float,
         area_thresh: float = 3.0,  # square meters
         size: int = 1000,
     ):
-        super().__init__(fov, min_depth, max_depth, size)
+        super().__init__(size)
         self._map = np.zeros((size, size), dtype=bool)
         self._navigable_map = np.zeros((size, size), dtype=bool)
         self._explored_area = np.zeros((size, size), dtype=bool)
-        self._hfov = np.deg2rad(fov)
         self._min_height = min_height
         self._max_height = max_height
         self._area_thresh_in_pixels = area_thresh * (self.pixels_per_meter**2)
-        self.__fx = None
         kernel_size = self.pixels_per_meter * agent_radius * 2
         # round kernel_size to nearest odd number
         kernel_size = int(kernel_size) + (int(kernel_size) % 2 == 0)
@@ -52,23 +47,39 @@ def reset(self):
         self._frontiers_px = np.array([])
         self.frontiers = np.array([])
 
-    @property
-    def _fx(self) -> float:
-        if self.__fx is None:
-            self.__fx = self._image_width / (2 * np.tan(self._hfov / 2))
-        return self.__fx
-
-    @property
-    def _fy(self) -> float:
-        return self._fx
-
-    def update_map(self, depth: np.ndarray, tf_camera_to_episodic: np.ndarray):
-        """Adds all obstacles from the current view to the map. Also updates the area
+    def update_map(
+        self,
+        depth: np.ndarray,
+        tf_camera_to_episodic: np.ndarray,
+        min_depth: float,
+        max_depth: float,
+        fx: float,
+        fy: float,
+        topdown_fov: float,
+    ):
+        """
+        Adds all obstacles from the current view to the map. Also updates the area
         that the robot has explored so far.
+
+        Args:
+            depth (np.ndarray): The depth image to use for updating the obstacle map.
+                It is normalized to the range [0, 1] and has a shape of (height, width).
+            tf_camera_to_episodic (np.ndarray): The transformation matrix from the
+                camera to the episodic coordinate frame.
+            min_depth (float): The minimum depth value (in meters) of the depth image.
+            max_depth (float): The maximum depth value (in meters) of the depth image.
+            fx (float): The focal length of the camera in the x direction.
+            fy (float): The focal length of the camera in the y direction.
+            topdown_fov (float): The field of view of the depth camera projected onto
+                the topdown map.
         """
-        if self._image_width is None:
-            self._image_width = depth.shape[1]
-        point_cloud_camera_frame = self._get_local_point_cloud(depth)
+
+        scaled_depth = depth * (max_depth - min_depth) + min_depth
+        mask = scaled_depth < max_depth
+        point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, fx, fy)
         point_cloud_episodic_frame = transform_points(
             tf_camera_to_episodic, point_cloud_camera_frame
         )
@@ -99,8 +110,8 @@ def update_map(self, depth: np.ndarray, tf_camera_to_episodic: np.ndarray):
             current_fog_of_war_mask=self._explored_area.astype(np.uint8),
             current_point=agent_pixel_location[::-1],
             current_angle=-self._last_camera_yaw,
-            fov=np.rad2deg(self._hfov),
-            max_line_len=self._max_depth * self.pixels_per_meter,
+            fov=np.rad2deg(topdown_fov),
+            max_line_len=max_depth * self.pixels_per_meter,
         )
 
         # Compute frontier locations
@@ -147,12 +158,6 @@ def visualize(self):
 
         return vis_img
 
-    def _get_local_point_cloud(self, depth: np.ndarray) -> np.ndarray:
-        scaled_depth = depth * (self._max_depth - self._min_depth) + self._min_depth
-        mask = scaled_depth < self._max_depth
-        point_cloud = get_point_cloud(scaled_depth, mask, self._fx, self._fy)
-        return point_cloud
-
 
 def filter_points_by_height(
     points: np.ndarray, min_height: float, max_height: float
@@ -167,11 +172,8 @@ def replay_from_dir():
         data = json.load(f)
 
     v = ObstacleMap(
-        fov=kwargs["fov"],
         min_height=float(kwargs.get("min_height", 0.15)),
         max_height=float(kwargs.get("max_height", 0.88)),
-        min_depth=kwargs["min_depth"],
-        max_depth=kwargs["max_depth"],
         agent_radius=float(kwargs.get("agent_radius", 0.18)),
         size=kwargs["size"],
     )
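A sketch of the new call shape for `ObstacleMap.update_map`, assuming the rest of the repo as-is (the pose, depth, and sensor numbers are illustrative; `topdown_fov` is simply the camera's HFOV here, as in habitat_policies.py below):

```python
import numpy as np
from zsos.mapping.obstacle_map import ObstacleMap

obstacle_map = ObstacleMap(
    min_height=0.15, max_height=0.88, agent_radius=0.18, area_thresh=3.0
)
depth = np.random.rand(480, 640).astype(np.float32)  # stand-in normalized depth
tf_camera_to_episodic = np.eye(4)

fov = np.deg2rad(79.0)
fx = fy = depth.shape[1] / (2 * np.tan(fov / 2))
obstacle_map.update_map(depth, tf_camera_to_episodic, 0.5, 5.0, fx, fy, fov)
```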
diff --git a/zsos/mapping/value_map.py b/zsos/mapping/value_map.py
index 66fc533..e114282 100644
--- a/zsos/mapping/value_map.py
+++ b/zsos/mapping/value_map.py
@@ -11,10 +11,12 @@
 import numpy as np
 
 from zsos.mapping.base_map import BaseMap
-from zsos.utils.geometry_utils import get_rotation_matrix
+from zsos.utils.geometry_utils import extract_yaw, get_rotation_matrix
 from zsos.utils.img_utils import (
     monochannel_to_inferno_rgb,
     pixel_value_within_radius,
+    place_img_in_img,
+    rotate_image,
 )
 
 DEBUG = False
@@ -29,7 +31,7 @@ class ValueMap(BaseMap):
     """Generates a map representing how valuable explored regions of the environment
     are with respect to finding and navigating to the target object."""
 
-    _confidence_mask: np.ndarray = None
+    _confidence_masks: Dict[Tuple[float, float], np.ndarray] = {}
     _camera_positions: List[np.ndarray] = []
     _last_camera_yaw: float = None
     _min_confidence: float = 0.25
@@ -37,9 +39,6 @@ class ValueMap(BaseMap):
 
     def __init__(
         self,
-        fov: float,
-        min_depth: float,
-        max_depth: float,
         value_channels: int,
         size: int = 1000,
         use_max_confidence: bool = True,
@@ -47,12 +46,11 @@ def __init__(
         """
         Args:
             value_channels: The number of channels in the value map.
-            fov: The field of view of the camera in degrees.
-            max_depth: The desired maximum depth of the camera in meters.
+            size: The size of the value map in pixels.
             use_max_confidence: Whether to use the maximum confidence value in the value
                 map or a weighted average confidence value.
         """
-        super().__init__(fov, min_depth, max_depth, size)
+        super().__init__(size)
         self._value_map = np.zeros((size, size, value_channels), np.float32)
         self._value_channels = value_channels
         self._use_max_confidence = use_max_confidence
@@ -68,9 +66,6 @@ def __init__(
             with open(KWARGS_JSON, "w") as f:
                 json.dump(
                     {
-                        "fov": fov,
-                        "min_depth": min_depth,
-                        "max_depth": max_depth,
                         "value_channels": value_channels,
                         "size": size,
                         "use_max_confidence": use_max_confidence,
@@ -86,23 +81,34 @@ def reset(self):
         self._value_map.fill(0)
 
     def update_map(
-        self, depth: np.ndarray, tf_camera_to_episodic: np.ndarray, values: np.ndarray
+        self,
+        values: np.ndarray,
+        depth: np.ndarray,
+        tf_camera_to_episodic: np.ndarray,
+        min_depth: float,
+        max_depth: float,
+        fov: float,
     ):
         """Updates the value map with the given depth image, pose, and value to use.
 
         Args:
+            values: The value to use for updating the map.
             depth: The depth image to use for updating the map; expected to be already
                 normalized to the range [0, 1].
             tf_camera_to_episodic: The transformation matrix from the episodic frame
                 to the camera frame.
-            values: The value to use for updating the map.
+            min_depth: The minimum depth value in meters.
+            max_depth: The maximum depth value in meters.
+            fov: The field of view of the camera in RADIANS.
         """
         assert len(values) == self._value_channels, (
             "Incorrect number of values given "
            f"({len(values)}). Expected {self._value_channels}."
        )
 
-        curr_map = self._localize_new_data(depth, tf_camera_to_episodic)
+        curr_map = self._localize_new_data(
+            depth, tf_camera_to_episodic, min_depth, max_depth, fov
+        )
 
         # Fuse the new data with the existing data
         self._fuse_new_data(curr_map, values)
@@ -189,14 +195,13 @@
         return map_img
 
     def _process_local_data(
-        self, depth: np.ndarray, tf_camera_to_episodic: np.ndarray = None
+        self, depth: np.ndarray, fov: float, min_depth: float, max_depth: float
     ) -> np.ndarray:
         """Using the FOV and depth, return the visible portion of the FOV.
 
         Args:
             depth: The depth image to use for determining the visible portion of the
                 FOV.
-            tf_camera_to_episodic: Currently unused for this subclass.
 
         Returns:
             A mask of the visible portion of the FOV.
         """
@@ -204,13 +209,10 @@
         if len(depth.shape) == 3:
             depth = depth.squeeze(2)
         # Squash depth image into one row with the max depth value for each column
-        depth_row = (
-            np.max(depth, axis=0) * (self._max_depth - self._min_depth)
-            + self._min_depth
-        )
+        depth_row = np.max(depth, axis=0) * (max_depth - min_depth) + min_depth
 
         # Create a linspace of the same length as the depth row from -fov/2 to fov/2
-        angles = np.linspace(-self._fov / 2, self._fov / 2, len(depth_row))
+        angles = np.linspace(-fov / 2, fov / 2, len(depth_row))
 
         # Assign each value in the row with an x, y coordinate depending on 'angles'
         # and the max depth value for that column
@@ -218,7 +220,7 @@
         x = depth_row
         y = depth_row * np.tan(angles)
 
         # Get blank cone mask
-        cone_mask = self._get_confidence_mask()
+        cone_mask = self._get_confidence_mask(fov, max_depth)
 
         # Convert the x, y coordinates to pixel coordinates
         x = (x * self.pixels_per_meter + cone_mask.shape[0] / 2).astype(int)
@@ -267,39 +269,68 @@
 
         return visible_mask
 
-    def _get_blank_cone_mask(self) -> np.ndarray:
+    def _localize_new_data(
+        self,
+        depth: np.ndarray,
+        tf_camera_to_episodic: np.ndarray,
+        min_depth: float,
+        max_depth: float,
+        fov: float,
+    ) -> np.ndarray:
+        # Get new portion of the map
+        curr_data = self._process_local_data(depth, fov, min_depth, max_depth)
+
+        # Rotate this new data to match the camera's orientation
+        self._last_camera_yaw = yaw = extract_yaw(tf_camera_to_episodic)
+        curr_data = rotate_image(curr_data, -yaw)
+
+        # Determine where this mask should be overlaid
+        cam_x, cam_y = tf_camera_to_episodic[:2, 3] / tf_camera_to_episodic[3, 3]
+        self._camera_positions.append(np.array([cam_x, cam_y]))
+
+        # Convert to pixel units
+        px = int(cam_x * self.pixels_per_meter) + self._episode_pixel_origin[0]
+        py = int(-cam_y * self.pixels_per_meter) + self._episode_pixel_origin[1]
+
+        # Overlay the new data onto the map
+        curr_map = np.zeros_like(self._map)
+        curr_map = place_img_in_img(curr_map, curr_data, px, py)
+
+        return curr_map
+
+    def _get_blank_cone_mask(self, fov: float, max_depth: float) -> np.ndarray:
         """Generate a FOV cone without any obstacles considered"""
-        size = int(self._max_depth * self.pixels_per_meter)
+        size = int(max_depth * self.pixels_per_meter)
         cone_mask = np.zeros((size * 2 + 1, size * 2 + 1))
         cone_mask = cv2.ellipse(
             cone_mask,
             (size, size),  # center_pixel
             (size, size),  # axes lengths
             0,  # angle circle is rotated
-            -np.rad2deg(self._fov) / 2 + 90,  # start_angle
-            np.rad2deg(self._fov) / 2 + 90,  # end_angle
+            -np.rad2deg(fov) / 2 + 90,  # start_angle
+            np.rad2deg(fov) / 2 + 90,  # end_angle
             1,  # color
             -1,  # thickness
         )
         return cone_mask
 
-    def _get_confidence_mask(self) -> np.ndarray:
+    def _get_confidence_mask(self, fov: float, max_depth: float) -> np.ndarray:
         """Generate a FOV cone with central values weighted more heavily"""
-        if self._confidence_mask is not None:
-            return self._confidence_mask.copy()
-        cone_mask = self._get_blank_cone_mask()
+        if (fov, max_depth) in self._confidence_masks:
+            return self._confidence_masks[(fov, max_depth)].copy()
+        cone_mask = self._get_blank_cone_mask(fov, max_depth)
         adjusted_mask = np.zeros_like(cone_mask).astype(np.float32)
         for row in range(adjusted_mask.shape[0]):
             for col in range(adjusted_mask.shape[1]):
                 horizontal = abs(row - adjusted_mask.shape[0] // 2)
                 vertical = abs(col - adjusted_mask.shape[1] // 2)
                 angle = np.arctan2(vertical, horizontal)
-                angle = remap(angle, 0, self._fov / 2, 0, np.pi / 2)
+                angle = remap(angle, 0, fov / 2, 0, np.pi / 2)
                 confidence = np.cos(angle) ** 2
                 confidence = remap(confidence, 0, 1, self._min_confidence, 1)
                 adjusted_mask[row, col] = confidence
         adjusted_mask = adjusted_mask * cone_mask
-        self._confidence_mask = adjusted_mask.copy()
+        self._confidence_masks[(fov, max_depth)] = adjusted_mask.copy()
 
         return adjusted_mask
@@ -386,9 +417,6 @@ def replay_from_dir():
         data = json.load(f)
 
     v = ValueMap(
-        fov=kwargs["fov"],
-        min_depth=float(kwargs.get("min_depth", 0.0)),
-        max_depth=float(kwargs["max_depth"]),
         value_channels=kwargs["value_channels"],
         size=kwargs["size"],
         use_max_confidence=kwargs["use_max_confidence"],
@@ -413,9 +441,14 @@
     # replay_from_dir()
     # quit()
 
-    v = ValueMap(fov=79, max_depth=5.0, value_channels=1)
+    v = ValueMap(value_channels=1)
 
     depth = cv2.imread("depth.png", cv2.IMREAD_GRAYSCALE).astype(np.float32) / 255.0
-    img = v._process_local_data(depth)
+    img = v._process_local_data(
+        depth=depth,
+        fov=np.deg2rad(79),
+        min_depth=0.5,
+        max_depth=5.0,
+    )
     cv2.imshow("img", (img * 255).astype(np.uint8))
     cv2.waitKey(0)
 
@@ -431,7 +464,14 @@
         tf = np.eye(4)
         tf[:2, 3] = pt
         tf[:2, :2] = get_rotation_matrix(angle)
-        v.update_map(depth, tf, np.array([1]))
+        v.update_map(
+            np.array([1]),
+            depth,
+            tf,
+            min_depth=0.5,
+            max_depth=5.0,
+            fov=np.deg2rad(79),
+        )
         img = v.visualize()
         cv2.imshow("img", img)
         key = cv2.waitKey(0)
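Because the confidence cone now depends on per-call parameters, the single cached mask becomes a dict keyed on `(fov, max_depth)`, letting one `ValueMap` serve cameras with different intrinsics. A toy sketch of the caching pattern (`build_mask` is a hypothetical stand-in for the cone construction):

```python
from typing import Dict, Tuple
import numpy as np

_masks: Dict[Tuple[float, float], np.ndarray] = {}

def build_mask(fov: float, max_depth: float) -> np.ndarray:
    # Stand-in for the expensive cone construction in _get_confidence_mask
    size = int(max_depth * 20)  # 20 px/m, as in BaseMap
    return np.ones((size * 2 + 1, size * 2 + 1), dtype=np.float32)

def get_mask(fov: float, max_depth: float) -> np.ndarray:
    key = (fov, max_depth)
    if key not in _masks:
        _masks[key] = build_mask(fov, max_depth)
    return _masks[key].copy()  # copy so callers cannot mutate the cached entry
```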
diff --git a/zsos/policy/base_objectnav_policy.py b/zsos/policy/base_objectnav_policy.py
index 59667c0..2691e1b 100644
--- a/zsos/policy/base_objectnav_policy.py
+++ b/zsos/policy/base_objectnav_policy.py
@@ -39,9 +39,6 @@ def __init__(
         depth_image_shape: Tuple[int, int],
         det_conf_threshold: float,
         pointnav_stop_radius: float,
-        object_map_min_depth: float,
-        object_map_max_depth: float,
-        object_map_hfov: float,
         object_map_erosion_size: float,
         visualize: bool = True,
         compute_frontiers: bool = True,
@@ -57,10 +54,7 @@ def __init__(
         self._mobile_sam = MobileSAMClient()
         self._pointnav_policy = WrappedPointNavResNetPolicy(pointnav_policy_path)
         self._object_map: ObjectPointCloudMap = ObjectPointCloudMap(
-            min_depth=object_map_min_depth,
-            max_depth=object_map_max_depth,
-            hfov=object_map_hfov,
-            erosion_size=object_map_erosion_size,
+            erosion_size=object_map_erosion_size
         )
         self._depth_image_shape = tuple(depth_image_shape)
         self._det_conf_threshold = det_conf_threshold
@@ -74,11 +68,8 @@ def __init__(
         self._compute_frontiers = compute_frontiers
         if compute_frontiers:
             self._obstacle_map = ObstacleMap(
-                fov=object_map_hfov,
                 min_height=min_obstacle_height,
                 max_height=max_obstacle_height,
-                min_depth=object_map_min_depth,
-                max_depth=object_map_max_depth,
                 area_thresh=obstacle_map_area_threshold,
                 agent_radius=agent_radius,
             )
@@ -112,9 +103,10 @@ def act(
 
         self._policy_info = {}
 
+        object_map_rgbd = self._observations_cache["object_map_rgbd"]
         detections = [
-            self._update_object_map(rgb, depth, tf)
-            for rgb, depth, tf in self._observations_cache["object_map_rgbd"]
+            self._update_object_map(rgb, depth, tf, min_depth, max_depth, fx, fy)
+            for (rgb, depth, tf, min_depth, max_depth, fx, fy) in object_map_rgbd
         ]
         robot_xy = self._observations_cache["robot_xy"]
         goal = self._get_target_object_location(robot_xy)
@@ -215,7 +207,12 @@ def _pointnav(self, goal: np.ndarray, deterministic=False, stop=False) -> Tensor
         it to determine the next action to take using the pre-trained pointnav policy.
 
         Args:
-            observations ("TensorDict"): The observations from the current timestep.
+            goal (np.ndarray): The goal to navigate to as (x, y), where x and y are in
+                meters.
+            deterministic (bool): Whether to use the deterministic or stochastic
+                policy.
+            stop (bool): Whether to stop if we are close enough to the goal.
+        """
         masks = torch.tensor([self._num_steps != 0], dtype=torch.bool, device="cuda")
         if not np.array_equal(goal, self._last_goal):
@@ -246,8 +243,35 @@ def _pointnav(self, goal: np.ndarray, deterministic=False, stop=False) -> Tensor
         return action
 
     def _update_object_map(
-        self, rgb: np.ndarray, depth: np.ndarray, tf_camera_to_episodic: np.ndarray
+        self,
+        rgb: np.ndarray,
+        depth: np.ndarray,
+        tf_camera_to_episodic: np.ndarray,
+        min_depth: float,
+        max_depth: float,
+        fx: float,
+        fy: float,
     ) -> ObjectDetections:
+        """
+        Updates the object map with the given rgb and depth images, and the given
+        transformation matrix from the camera to the episodic coordinate frame.
+
+        Args:
+            rgb (np.ndarray): The rgb image to use for updating the object map. Used for
+                object detection and Mobile SAM segmentation to extract better object
+                point clouds.
+            depth (np.ndarray): The depth image to use for updating the object map. It
+                is normalized to the range [0, 1] and has a shape of (height, width).
+            tf_camera_to_episodic (np.ndarray): The transformation matrix from the
+                camera to the episodic coordinate frame.
+            min_depth (float): The minimum depth value (in meters) of the depth image.
+            max_depth (float): The maximum depth value (in meters) of the depth image.
+            fx (float): The focal length of the camera in the x direction.
+            fy (float): The focal length of the camera in the y direction.
+
+        Returns:
+            ObjectDetections: The object detections from the object detector.
+        """
         detections = self._get_object_detections(rgb)
         height, width = rgb.shape[:2]
         self._object_masks = np.zeros((height, width), dtype=np.uint8)
@@ -262,6 +286,10 @@ def _update_object_map(
                 depth,
                 object_mask,
                 tf_camera_to_episodic,
+                min_depth,
+                max_depth,
+                fx,
+                fy,
             )
 
         self._object_map.update_explored(tf_camera_to_episodic)
@@ -284,10 +312,6 @@ class ZSOSConfig:
     depth_image_shape: Tuple[int, int] = (244, 224)
     det_conf_threshold: float = 0.6
     pointnav_stop_radius: float = 0.9
-    object_map_min_depth: float = 0.5
-    object_map_max_depth: float = 5.0
-    object_map_hfov: float = 79.0
-    value_map_hfov: float = 79.0
     object_map_proximity_threshold: float = 1.5
     use_max_confidence: bool = False
     object_map_erosion_size: int = 5
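The observations cache now carries the camera parameters alongside each frame, so `act()` can forward them without the policy owning any intrinsics. An illustrative entry with the assumed tuple layout (real entries are built in habitat_policies.py below; all numbers made up):

```python
import numpy as np

object_map_rgbd = [
    (
        np.zeros((480, 640, 3), np.uint8),  # rgb
        np.zeros((480, 640), np.float32),   # depth, normalized to [0, 1]
        np.eye(4),                          # tf_camera_to_episodic
        0.5,                                # min_depth (m)
        5.0,                                # max_depth (m)
        388.2,                              # fx (px)
        388.2,                              # fy (px)
    )
]
rgb, depth, tf, min_depth, max_depth, fx, fy = object_map_rgbd[0]
```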
diff --git a/zsos/policy/habitat_policies.py b/zsos/policy/habitat_policies.py
index b077d14..f3c1832 100644
--- a/zsos/policy/habitat_policies.py
+++ b/zsos/policy/habitat_policies.py
@@ -19,6 +19,7 @@
 from zsos.utils.geometry_utils import xyz_yaw_to_tf_matrix
 from zsos.vlm.grounding_dino import ObjectDetections
 
+from ..mapping.obstacle_map import ObstacleMap
 from .base_objectnav_policy import BaseObjectNavPolicy, ZSOSConfig
 from .itm_policy import ITMPolicy, ITMPolicyV2, ITMPolicyV3
 
@@ -42,9 +43,23 @@ class HabitatMixin:
     _stop_action: Tensor = TorchActionIDs.STOP
     _start_yaw: Union[float, None] = None  # must be set by _reset() method
 
-    def __init__(self, camera_height: float, *args: Any, **kwargs: Any) -> None:
-        self._camera_height = camera_height
+    def __init__(
+        self,
+        camera_height: float,
+        min_depth: float,
+        max_depth: float,
+        camera_fov: float,
+        image_width: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
         super().__init__(*args, **kwargs)
+        self._camera_height = camera_height
+        self._min_depth = min_depth
+        self._max_depth = max_depth
+        camera_fov_rad = np.deg2rad(camera_fov)
+        self._camera_fov = camera_fov_rad
+        self._fx = self._fy = image_width / (2 * np.tan(camera_fov_rad / 2))
 
     @classmethod
     def from_config(cls, config: DictConfig, *args_unused, **kwargs_unused):
@@ -58,8 +73,10 @@ def from_config(cls, config: DictConfig, *args_unused, **kwargs_unused):
         kwargs["camera_height"] = sim_sensors_cfg.rgb_sensor.position[1]
 
         # Synchronize the mapping min/max depth values with the habitat config
-        kwargs["value_map_min_depth"] = sim_sensors_cfg.depth_sensor.min_depth
-        kwargs["value_map_max_depth"] = sim_sensors_cfg.depth_sensor.max_depth
+        kwargs["min_depth"] = sim_sensors_cfg.depth_sensor.min_depth
+        kwargs["max_depth"] = sim_sensors_cfg.depth_sensor.max_depth
+        kwargs["camera_fov"] = sim_sensors_cfg.depth_sensor.hfov
+        kwargs["image_width"] = sim_sensors_cfg.depth_sensor.width
 
         # Only bother visualizing if we're actually going to save the video
         kwargs["visualize"] = len(config.habitat_baselines.eval.video_option) > 0
@@ -129,8 +146,18 @@ def _cache_observations(
         # Habitat GPS makes west negative, so flip y
         camera_position = np.array([x, -y, self._camera_height])
         tf_camera_to_episodic = xyz_yaw_to_tf_matrix(camera_position, camera_yaw)
+
+        self._obstacle_map: ObstacleMap
         if self._compute_frontiers:
-            self._obstacle_map.update_map(depth, tf_camera_to_episodic)
+            self._obstacle_map.update_map(
+                depth,
+                tf_camera_to_episodic,
+                self._min_depth,
+                self._max_depth,
+                self._fx,
+                self._fy,
+                self._camera_fov,
+            )
             frontiers = self._obstacle_map.frontiers
         else:
             if "frontier_sensor" in observations:
@@ -143,8 +170,27 @@ def _cache_observations(
             "nav_depth": observations["depth"],  # for pointnav
             "robot_xy": camera_position[:2],
             "robot_heading": camera_yaw,
-            "object_map_rgbd": [(rgb, depth, tf_camera_to_episodic)],
-            "value_map_rgbd": [(rgb, depth, tf_camera_to_episodic)],
+            "object_map_rgbd": [
+                (
+                    rgb,
+                    depth,
+                    tf_camera_to_episodic,
+                    self._min_depth,
+                    self._max_depth,
+                    self._fx,
+                    self._fy,
+                )
+            ],
+            "value_map_rgbd": [
+                (
+                    rgb,
+                    depth,
+                    tf_camera_to_episodic,
+                    self._min_depth,
+                    self._max_depth,
+                    self._camera_fov,
+                )
+            ],
         }
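For reference, the focal length computed in `__init__` works out as follows for a 640 px wide sensor and the 79-degree HFOV used elsewhere in this PR (both values illustrative here):

```python
import numpy as np

camera_fov_rad = np.deg2rad(79.0)
fx = fy = 640 / (2 * np.tan(camera_fov_rad / 2))
print(round(fx, 1))  # ~388.2 px
```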
diff --git a/zsos/policy/itm_policy.py b/zsos/policy/itm_policy.py
index f1104cc..4b91759 100644
--- a/zsos/policy/itm_policy.py
+++ b/zsos/policy/itm_policy.py
@@ -37,9 +37,6 @@ def _vis_reduce_fn(i):
     def __init__(
         self,
         text_prompt: str,
-        value_map_min_depth: float,
-        value_map_max_depth: float,
-        value_map_hfov: float,
         use_max_confidence: bool = True,
         *args,
         **kwargs,
@@ -48,9 +45,6 @@ def __init__(
         self._itm = BLIP2ITMClient()
         self._text_prompt = text_prompt
         self._value_map: ValueMap = ValueMap(
-            fov=value_map_hfov,
-            min_depth=value_map_min_depth,
-            max_depth=value_map_max_depth,
             value_channels=len(text_prompt.split("\n")),
             use_max_confidence=use_max_confidence,
         )
@@ -171,7 +165,7 @@ def _get_policy_info(self, detections: ObjectDetections) -> Dict[str, Any]:
         return policy_info
 
     def _update_value_map(self):
-        all_rgb = [rgb for rgb, _, _ in self._observations_cache["value_map_rgbd"]]
+        all_rgb = [i[0] for i in self._observations_cache["value_map_rgbd"]]
         cosines = [
             [
                 self._itm.cosine(rgb, p.replace("target_object", self._target_object))
@@ -179,10 +173,12 @@ def _update_value_map(self):
             ]
             for rgb in all_rgb
         ]
-        for cosine, (rgb, depth, tf) in zip(
+        for cosine, (rgb, depth, tf, min_depth, max_depth, fov) in zip(
             cosines, self._observations_cache["value_map_rgbd"]
         ):
-            self._value_map.update_map(depth, tf, np.array(cosine))
+            self._value_map.update_map(
+                np.array(cosine), depth, tf, min_depth, max_depth, fov
+            )
 
     def _sort_frontiers_by_value(
         self, observations: "TensorDict", frontiers: np.ndarray
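Note the reordered `ValueMap.update_map` signature: values come first. A minimal standalone call under the same assumptions as the earlier sketches (0.7 is a made-up cosine score):

```python
import numpy as np
from zsos.mapping.value_map import ValueMap

value_map = ValueMap(value_channels=1)
depth = np.zeros((480, 640), np.float32)  # stand-in normalized depth
tf = np.eye(4)                            # identity camera-to-episodic pose
value_map.update_map(np.array([0.7]), depth, tf, 0.5, 5.0, np.deg2rad(79.0))
```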