From 75a4d3a361fb994eedcede17711cab071774ddde Mon Sep 17 00:00:00 2001 From: Bernadette Bucher <131724239+bernadettekb@users.noreply.github.com> Date: Tue, 19 Dec 2023 12:29:49 -0500 Subject: [PATCH] Compatible version with bdai merge including HW experiment code (#19) Co-authored-by: Katrina Ashton --- .pre-commit-config.yaml | 91 +++++++++++++------ config/experiments/reality.yaml | 4 +- pyproject.toml | 26 +++--- scripts/parse_jsons.py | 26 ++---- test/test_setup.py | 2 +- test/test_visualization.py | 2 +- vlfm/mapping/base_map.py | 17 +--- vlfm/mapping/frontier_map.py | 13 +-- vlfm/mapping/object_point_cloud_map.py | 41 ++------- vlfm/mapping/obstacle_map.py | 20 +--- vlfm/mapping/traj_visualizer.py | 30 ++---- vlfm/mapping/value_map.py | 67 ++++---------- vlfm/measurements/traveled_stairs.py | 4 +- vlfm/obs_transformers/resize.py | 25 ++--- vlfm/obs_transformers/utils.py | 4 +- vlfm/policy/action_replay_policy.py | 4 +- vlfm/policy/base_objectnav_policy.py | 70 ++++---------- vlfm/policy/base_policy.py | 4 +- vlfm/policy/habitat_policies.py | 20 +--- vlfm/policy/itm_policy.py | 30 ++---- vlfm/policy/reality_policies.py | 83 ++++++----------- vlfm/policy/utils/acyclic_enforcer.py | 8 +- .../non_habitat_policy/nh_pointnav_policy.py | 20 +--- .../policy/utils/non_habitat_policy/resnet.py | 20 +--- .../non_habitat_policy/rnn_state_encoder.py | 25 ++--- vlfm/policy/utils/pointnav_policy.py | 35 +++---- vlfm/reality/bdsw_nav_env.py | 4 +- vlfm/reality/objectnav_env.py | 16 +--- vlfm/reality/pointnav_env.py | 16 +--- vlfm/reality/robots/base_robot.py | 4 +- vlfm/reality/robots/bdsw_robot.py | 11 +-- vlfm/reality/robots/frame_ids.py | 8 ++ vlfm/reality/run_bdsw_objnav_env.py | 8 +- vlfm/semexp_env/eval.py | 18 +--- vlfm/utils/episode_stats_logger.py | 4 +- vlfm/utils/geometry_utils.py | 34 ++----- vlfm/utils/habitat_visualizer.py | 16 +--- vlfm/utils/img_utils.py | 47 +++------- vlfm/utils/log_saver.py | 4 +- vlfm/utils/visualization.py | 4 +- vlfm/utils/vlfm_trainer.py | 45 +++------ vlfm/vlm/blip2.py | 8 +- vlfm/vlm/blip2itm.py | 16 ++-- vlfm/vlm/detections.py | 18 ++-- vlfm/vlm/fiber.py | 16 +--- vlfm/vlm/grounding_dino.py | 20 +--- vlfm/vlm/sam.py | 8 +- vlfm/vlm/yolov7.py | 16 +--- 48 files changed, 339 insertions(+), 693 deletions(-) create mode 100644 vlfm/reality/robots/frame_ids.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 13392d6..ace4a5f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,46 +1,81 @@ -# Copyright [2023] Boston Dynamics AI Institute, Inc. +# Copyright (c) 2023 Boston Dynamics AI Institute LLC. All rights reserved. repos: -- repo: https://github.com/charliermarsh/ruff-pre-commit - rev: 'v0.0.263' - hooks: - - id: ruff - args: ['--fix', '--config', 'pyproject.toml'] +- repo: https://github.com/charliermarsh/ruff-pre-commit + # Ruff version. 
+ rev: 'v0.1.0' + hooks: + - id: ruff + args: ['--fix', '--config', 'pyproject.toml'] # we want this to refer to `bdai/pyproject.toml` +- repo: https://github.com/psf/black + rev: 23.10.0 + hooks: + - id: black + language_version: python3.10 + args: ['--config', 'pyproject.toml'] # we want this to refer to `bdai/pyproject.toml` + verbose: true - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-yaml - args: ['--unsafe'] + args: ['--unsafe'] # details about the unsafe flag: + # https://github.com/pre-commit/pre-commit-hooks#check-yaml + # This is the solution proposed to prevent `check-yaml` from failing on custom tags: + # https://github.com/pre-commit/pre-commit-hooks/issues/701 - id: check-added-large-files args: ['--enforce-all', '--maxkb', '200'] + # For the `exclude` argument, see https://pre-commit.com/#regular-expressions + # Make sure to escape strings correctly to ensure literal matches, for example, using Python + # + # >>> print(re.escape("path/to-some/file.ext")) + # path/to\-some/file\.ext + # + # `path/to\-some/file\.ext` is the correct regex pattern to match `path/to-some/file.ext` literally. + # The inverse operation is more cumbersome: https://stackoverflow.com/a/54840284 + exclude: | + (?x)^( + docker/ros/web/static/novnc/vendor/browser\-es\-module\-loader/dist/babel\-worker\.js| + docker/ros/rootfs/usr/local/share/doro\-lxde\-wallpapers/bg.*\.jpg| + docker/ros/web/yarn\.lock| + src/modelzoo/detic/lvis_v1_train_cat_info\.json| + src/modelzoo/edge_grasp_serve/example_pc\.npy + )$ - id: check-toml - id: end-of-file-fixer - id: check-merge-conflict - id: check-executables-have-shebangs - id: check-shebang-scripts-are-executable - -- repo: https://github.com/psf/black - rev: 23.3.0 +- repo: https://github.com/ssciwr/clang-format-hook.git + rev: v12.0.1 # Use the sha / tag you want to point at hooks: - - id: black - language_version: python3.9 - args: ['--config', 'pyproject.toml'] - verbose: true - -- repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.2.0 + - id: clang-format + types_or: [c++, c, cuda] +- repo: https://github.com/cpplint/cpplint.git + rev: 1.6.1 hooks: - - id: mypy - pass_filenames: false - additional_dependencies: - - types-protobuf - - types-requests - - types-simplejson - - types-ujson - - types-PyYAML - - types-toml - - types-six + - id: cpplint + args: ['--quiet'] + exclude_types: [cuda] +#- repo: https://github.com/leoll2/copyright_notice_precommit +# rev: 0.1.1 +# hooks: +# - id: copyright-notice +# args: [--notice=copyright.txt] + +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.6.1 + hooks: + - id: mypy + pass_filenames: false + additional_dependencies: + - types-protobuf + - types-requests + - types-simplejson + - types-ujson + - types-PyYAML + - types-toml + - types-six - repo: https://github.com/jumanjihouse/pre-commit-hooks rev: 3.0.0 diff --git a/config/experiments/reality.yaml b/config/experiments/reality.yaml index 632851f..4c213e0 100644 --- a/config/experiments/reality.yaml +++ b/config/experiments/reality.yaml @@ -10,7 +10,7 @@ policy: name: "RealityITMPolicyV2" pointnav_policy_path: "data/pointnav_weights.pth" depth_image_shape: [212, 240] # height, width - pointnav_stop_radius: 1.2 + pointnav_stop_radius: 0.9 use_max_confidence: False object_map_erosion_size: 5 exploration_thresh: 0.0 @@ -21,7 +21,7 @@ policy: agent_radius: 0.2 env: - max_body_cam_depth: 3.5 + max_body_cam_depth: 2.5 max_gripper_cam_depth: 5.0 max_lin_dist: 0.2 max_ang_dist: 0.523599 diff --git 
a/pyproject.toml b/pyproject.toml index 1074931..3acc9bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,14 +12,14 @@ authors = [ {name = "Naoki Yokoyama", email = "nyokoyama@theaiinstitute.com"}, ] readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "torch >= 1.10.1", "numpy >= 1.22.4", "flask >= 2.3.2", "seaborn >= 0.12.2", # required by yolov7 "open3d >= 0.17.0", - "transformers == 4.26.0", # higher versions break BLIP-2 + "transformers >= 4.28.1", # higher versions than 4.26.0 "break" BLIP-2 but need 4.28.1 for integration "salesforce-lavis >= 1.0.2", # for BLIP-2 "frontier_exploration @ git+https://github.com/naokiyokoyama/frontier_exploration.git", "mobile_sam @ git+https://github.com/ChaoningZhang/MobileSAM.git", @@ -49,8 +49,8 @@ reality = [ "Homepage" = "theaiinstitute.com" "GitHub" = "https://github.com/bdaiinstitute/vlfm" -[tool.setuptools] -packages = ["vlfm", "config"] +[tool.setuptools.packages.find] +where = ["vlfm"] [tool.ruff] # Enable pycodestyle (`E`), Pyflakes (`F`), and import sorting (`I`) @@ -92,8 +92,8 @@ line-length = 120 # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -# Assume Python 3.9. -target-version = "py39" +# Assume Python 3.10 +target-version = "py310" [tool.ruff.per-file-ignores] "__init__.py" = ["F401"] @@ -103,8 +103,8 @@ target-version = "py39" max-complexity = 10 [tool.black] -line-length = 88 -target-version = ['py39'] +line-length = 120 +target-version = ['py310'] include = '\.pyi?$' # `extend-exclude` is not honored when `black` is passed a file path explicitly, # as is typical when `black` is invoked via `pre-commit`. @@ -116,9 +116,13 @@ force-exclude = ''' preview = true +[tool.coverage.run] +relative_files = true + + # mypy configuration [tool.mypy] -python_version = "3.9" +python_version = "3.10" disallow_untyped_defs = true ignore_missing_imports = true explicit_package_bases = true @@ -127,5 +131,5 @@ strict_equality = true warn_unreachable = true warn_redundant_casts = true no_implicit_optional = true -files = ['vlfm'] -exclude = '^(docker|.*external|.*thirdparty|.*install|.*build|.*_experimental)/' +files = ['vlfm','test','scripts'] +exclude = "^(docker|.*external|.*thirdparty|.*install|.*build|.*_experimental|.*_pb2.py|.*_pb2_grpc.py)" diff --git a/scripts/parse_jsons.py b/scripts/parse_jsons.py index 949d2d9..8d31c71 100644 --- a/scripts/parse_jsons.py +++ b/scripts/parse_jsons.py @@ -46,9 +46,7 @@ def calculate_frequencies(failure_causes: List[str]) -> None: for cause, count in counter.most_common(): percentage = (count / total) * 100 # Add each row to the table - table.add_row( - [cause.replace("did_not_fail", "succeeded!"), count, f"{percentage:.2f}%"] - ) + table.add_row([cause.replace("did_not_fail", "succeeded!"), count, f"{percentage:.2f}%"]) print(table) @@ -60,10 +58,7 @@ def calculate_avg_performance(stats: List[Dict[str, Any]]) -> None: Args: stats (List[Dict[str, Any]]): A list of stats for each episode. 
""" - success, spl, soft_spl = [ - [episode.get(k, -1) for episode in stats] - for k in ["success", "spl", "soft_spl"] - ] + success, spl, soft_spl = [[episode.get(k, -1) for episode in stats] for k in ["success", "spl", "soft_spl"]] # Create a table with headers table = PrettyTable(["Metric", "Average"]) @@ -101,28 +96,23 @@ def calculate_avg_fail_per_category(stats: List[Dict[str, Any]]) -> None: table = PrettyTable(["Category", "Average Failure Rate"]) # Add each row to the table - for category, stats in sorted( + for category, c_stats in sorted( category_stats.items(), key=lambda x: x[1]["fail_count"], reverse=True, ): - avg_failure_rate = (stats["fail_count"] / stats["total_count"]) * 100 + avg_failure_rate = (c_stats["fail_count"] / c_stats["total_count"]) * 100 table.add_row( [ category, - ( - f"{avg_failure_rate:.2f}% ({stats['fail_count']}/" - f"{stats['total_count']})" - ), + f"{avg_failure_rate:.2f}% ({c_stats['fail_count']}/{c_stats['total_count']})", ] ) print(table) -def calculate_avg_fail_rate_per_category( - stats: List[Dict[str, Any]], failure_cause: str -) -> None: +def calculate_avg_fail_rate_per_category(stats: List[Dict[str, Any]], failure_cause: str) -> None: """ For each possible "target_object", count the number of times the agent failed due to the given failure cause. Then, sum the counts across all categories and use it to @@ -147,9 +137,7 @@ def calculate_avg_fail_rate_per_category( table = PrettyTable(["Category", f"% Occurrence for {failure_cause}"]) # Sort the categories by their failure count in descending order - sorted_categories = sorted( - category_to_fail_count.items(), key=lambda x: x[1], reverse=True - ) + sorted_categories = sorted(category_to_fail_count.items(), key=lambda x: x[1], reverse=True) # Add each row to the table for category, count in sorted_categories: diff --git a/test/test_setup.py b/test/test_setup.py index 3847d66..c4126d2 100644 --- a/test/test_setup.py +++ b/test/test_setup.py @@ -7,7 +7,7 @@ from vlfm.utils.generate_dummy_policy import save_dummy_policy -def test_load_and_save_config(): +def test_load_and_save_config() -> None: if not os.path.exists("data"): os.makedirs("data") diff --git a/test/test_visualization.py b/test/test_visualization.py index cb0c969..e984058 100644 --- a/test/test_visualization.py +++ b/test/test_visualization.py @@ -7,7 +7,7 @@ from vlfm.utils.visualization import generate_text_image -def test_visualization(): +def test_visualization() -> None: if not os.path.exists("build"): os.makedirs("build") diff --git a/vlfm/mapping/base_map.py b/vlfm/mapping/base_map.py index 8247481..37ffe90 100644 --- a/vlfm/mapping/base_map.py +++ b/vlfm/mapping/base_map.py @@ -12,9 +12,7 @@ class BaseMap: _last_camera_yaw: float = 0.0 _map_dtype: np.dtype = np.dtype(np.float32) - def __init__( - self, size: int = 1000, pixels_per_meter: int = 20, *args: Any, **kwargs: Any - ): + def __init__(self, size: int = 1000, pixels_per_meter: int = 20, *args: Any, **kwargs: Any): """ Args: size: The size of the map in pixels. 
@@ -23,16 +21,12 @@ def __init__( self.size = size self._map = np.zeros((size, size), dtype=self._map_dtype) self._episode_pixel_origin = np.array([size // 2, size // 2]) - self._traj_vis = TrajectoryVisualizer( - self._episode_pixel_origin, self.pixels_per_meter - ) + self._traj_vis = TrajectoryVisualizer(self._episode_pixel_origin, self.pixels_per_meter) def reset(self) -> None: self._map.fill(0) self._camera_positions = [] - self._traj_vis = TrajectoryVisualizer( - self._episode_pixel_origin, self.pixels_per_meter - ) + self._traj_vis = TrajectoryVisualizer(self._episode_pixel_origin, self.pixels_per_meter) def update_agent_traj(self, robot_xy: np.ndarray, robot_heading: float) -> None: self._camera_positions.append(robot_xy) @@ -47,10 +41,7 @@ def _xy_to_px(self, points: np.ndarray) -> np.ndarray: Returns: The array of (x, y) pixel coordinates. """ - px = ( - np.rint(points[:, ::-1] * self.pixels_per_meter) - + self._episode_pixel_origin - ) + px = np.rint(points[:, ::-1] * self.pixels_per_meter) + self._episode_pixel_origin px[:, 0] = self._map.shape[0] - px[:, 0] return px.astype(int) diff --git a/vlfm/mapping/frontier_map.py b/vlfm/mapping/frontier_map.py index 57ebe05..ec061ec 100644 --- a/vlfm/mapping/frontier_map.py +++ b/vlfm/mapping/frontier_map.py @@ -22,9 +22,7 @@ def __init__(self, encoding_type: str = "cosine"): def reset(self) -> None: self.frontiers = [] - def update( - self, frontier_locations: List[np.ndarray], curr_image: np.ndarray, text: str - ) -> None: + def update(self, frontier_locations: List[np.ndarray], curr_image: np.ndarray, text: str) -> None: """ Takes in a list of frontier coordinates and the current image observation from the robot. Any stored frontiers that are not present in the given list are @@ -41,19 +39,14 @@ def update( self.frontiers = [ frontier for frontier in self.frontiers - if any( - np.array_equal(frontier.xyz, location) - for location in frontier_locations - ) + if any(np.array_equal(frontier.xyz, location) for location in frontier_locations) ] # Add any frontiers that are not already stored. Set their image field to the # given image. 
cosine = None for location in frontier_locations: - if not any( - np.array_equal(frontier.xyz, location) for frontier in self.frontiers - ): + if not any(np.array_equal(frontier.xyz, location) for frontier in self.frontiers): if cosine is None: cosine = self._encode(curr_image, text) self.frontiers.append(Frontier(location, cosine)) diff --git a/vlfm/mapping/object_point_cloud_map.py b/vlfm/mapping/object_point_cloud_map.py index eb0acaa..994f614 100644 --- a/vlfm/mapping/object_point_cloud_map.py +++ b/vlfm/mapping/object_point_cloud_map.py @@ -41,9 +41,7 @@ def update_map( fy: float, ) -> None: """Updates the object map with the latest information from the agent.""" - local_cloud = self._extract_object_cloud( - depth_img, object_mask, min_depth, max_depth, fx, fy - ) + local_cloud = self._extract_object_cloud(depth_img, object_mask, min_depth, max_depth, fx, fy) if len(local_cloud) == 0: return @@ -72,15 +70,11 @@ def update_map( return if object_name in self.clouds: - self.clouds[object_name] = np.concatenate( - (self.clouds[object_name], global_cloud), axis=0 - ) + self.clouds[object_name] = np.concatenate((self.clouds[object_name], global_cloud), axis=0) else: self.clouds[object_name] = global_cloud - def get_best_object( - self, target_class: str, curr_position: np.ndarray - ) -> np.ndarray: + def get_best_object(self, target_class: str, curr_position: np.ndarray) -> np.ndarray: target_cloud = self.get_target_cloud(target_class) closest_point_2d = self._get_closest_point(target_cloud, curr_position)[:2] @@ -96,10 +90,7 @@ def get_best_object( if delta_dist < 0.1: # closest point is only slightly different return self.last_target_coord - elif ( - delta_dist < 0.5 - and np.linalg.norm(curr_position - closest_point_2d) > 2.0 - ): + elif delta_dist < 0.5 and np.linalg.norm(curr_position - closest_point_2d) > 2.0: # closest point is a little different, but the agent is too far for # the difference to matter much return self.last_target_coord @@ -108,9 +99,7 @@ def get_best_object( return self.last_target_coord - def update_explored( - self, tf_camera_to_episodic: np.ndarray, max_depth: float, cone_fov: float - ) -> None: + def update_explored(self, tf_camera_to_episodic: np.ndarray, max_depth: float, cone_fov: float) -> None: """ This method will remove all point clouds in self.clouds that were originally detected to be out-of-range, but are now within range. 
This is just a heuristic @@ -140,9 +129,7 @@ def update_explored( # Detection was originally within range continue # Remove all points from self.clouds[obj] that have the same range_id - self.clouds[obj] = self.clouds[obj][ - self.clouds[obj][..., -1] != range_id - ] + self.clouds[obj] = self.clouds[obj][self.clouds[obj][..., -1] != range_id] def get_target_cloud(self, target_class: str) -> np.ndarray: target_cloud = self.clouds[target_class].copy() @@ -163,9 +150,7 @@ def _extract_object_cloud( fy: float, ) -> np.ndarray: final_mask = object_mask * 255 - final_mask = cv2.erode( # type: ignore - final_mask, None, iterations=self._erosion_size - ) + final_mask = cv2.erode(final_mask, None, iterations=self._erosion_size) # type: ignore valid_depth = depth.copy() valid_depth[valid_depth == 0] = 1 # set all holes (0) to just be far (1) @@ -177,15 +162,11 @@ def _extract_object_cloud( return cloud - def _get_closest_point( - self, cloud: np.ndarray, curr_position: np.ndarray - ) -> np.ndarray: + def _get_closest_point(self, cloud: np.ndarray, curr_position: np.ndarray) -> np.ndarray: ndim = curr_position.shape[0] if self.use_dbscan: # Return the point that is closest to curr_position, which is 2D - closest_point = cloud[ - np.argmin(np.linalg.norm(cloud[:, :ndim] - curr_position, axis=1)) - ] + closest_point = cloud[np.argmin(np.linalg.norm(cloud[:, :ndim] - curr_position, axis=1))] else: # Calculate the Euclidean distance from each point to the reference point if ndim == 2: @@ -208,9 +189,7 @@ def _get_closest_point( return closest_point -def open3d_dbscan_filtering( - points: np.ndarray, eps: float = 0.2, min_points: int = 100 -) -> np.ndarray: +def open3d_dbscan_filtering(points: np.ndarray, eps: float = 0.2, min_points: int = 100) -> np.ndarray: pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(points) diff --git a/vlfm/mapping/obstacle_map.py b/vlfm/mapping/obstacle_map.py index c840347..705a4ff 100644 --- a/vlfm/mapping/obstacle_map.py +++ b/vlfm/mapping/obstacle_map.py @@ -92,12 +92,8 @@ def update_map( scaled_depth = filled_depth * (max_depth - min_depth) + min_depth mask = scaled_depth < max_depth point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, fx, fy) - point_cloud_episodic_frame = transform_points( - tf_camera_to_episodic, point_cloud_camera_frame - ) - obstacle_cloud = filter_points_by_height( - point_cloud_episodic_frame, self._min_height, self._max_height - ) + point_cloud_episodic_frame = transform_points(tf_camera_to_episodic, point_cloud_camera_frame) + obstacle_cloud = filter_points_by_height(point_cloud_episodic_frame, self._min_height, self._max_height) # Populate topdown map with obstacle locations xy_points = obstacle_cloud[:, :2] @@ -126,9 +122,7 @@ def update_map( fov=np.rad2deg(topdown_fov), max_line_len=max_depth * self.pixels_per_meter, ) - new_explored_area = cv2.dilate( - new_explored_area, np.ones((3, 3), np.uint8), iterations=1 - ) + new_explored_area = cv2.dilate(new_explored_area, np.ones((3, 3), np.uint8), iterations=1) self.explored_area[new_explored_area > 0] = 1 self.explored_area[self._navigable_map == 0] = 0 contours, _ = cv2.findContours( @@ -140,9 +134,7 @@ def update_map( min_dist = np.inf best_idx = 0 for idx, cnt in enumerate(contours): - dist = cv2.pointPolygonTest( - cnt, tuple([int(i) for i in agent_pixel_location]), True - ) + dist = cv2.pointPolygonTest(cnt, tuple([int(i) for i in agent_pixel_location]), True) if dist >= 0: best_idx = idx break @@ -201,7 +193,5 @@ def visualize(self) -> np.ndarray: return 
vis_img -def filter_points_by_height( - points: np.ndarray, min_height: float, max_height: float -) -> np.ndarray: +def filter_points_by_height(points: np.ndarray, min_height: float, max_height: float) -> np.ndarray: return points[(points[:, 2] >= min_height) & (points[:, 2] <= max_height)] diff --git a/vlfm/mapping/traj_visualizer.py b/vlfm/mapping/traj_visualizer.py index 661c0cb..b7e2df7 100644 --- a/vlfm/mapping/traj_visualizer.py +++ b/vlfm/mapping/traj_visualizer.py @@ -36,9 +36,7 @@ def draw_trajectory( img = self._draw_agent(img, camera_positions[-1], camera_yaw) return img - def _draw_path( - self, img: np.ndarray, camera_positions: Union[np.ndarray, List[np.ndarray]] - ) -> np.ndarray: + def _draw_path(self, img: np.ndarray, camera_positions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: """Draws the path on the image and returns it""" if len(camera_positions) < 2: return img @@ -48,9 +46,7 @@ def _draw_path( path_mask = np.zeros(img.shape[:2], dtype=np.uint8) for i in range(self._num_drawn_points - 1, len(camera_positions) - 1): - path_mask = self._draw_line( - path_mask, camera_positions[i], camera_positions[i + 1] - ) + path_mask = self._draw_line(path_mask, camera_positions[i], camera_positions[i + 1]) img[path_mask == 255] = self.path_color @@ -59,9 +55,7 @@ def _draw_path( return img - def _draw_line( - self, img: np.ndarray, pt_a: np.ndarray, pt_b: np.ndarray - ) -> np.ndarray: + def _draw_line(self, img: np.ndarray, pt_a: np.ndarray, pt_b: np.ndarray) -> np.ndarray: """Draws a line between two points and returns it""" # Convert metric coordinates to pixel coordinates px_a = self._metric_to_pixel(pt_a) @@ -80,9 +74,7 @@ def _draw_line( return img - def _draw_agent( - self, img: np.ndarray, camera_position: np.ndarray, camera_yaw: float - ) -> np.ndarray: + def _draw_agent(self, img: np.ndarray, camera_position: np.ndarray, camera_yaw: float) -> np.ndarray: """Draws the agent on the image and returns it""" px_position = self._metric_to_pixel(camera_position) cv2.circle( @@ -93,14 +85,8 @@ def _draw_agent( -1, ) heading_end_pt = ( - int( - px_position[0] - - self.agent_line_length * self.scale_factor * np.cos(camera_yaw) - ), - int( - px_position[1] - - self.agent_line_length * self.scale_factor * np.sin(camera_yaw) - ), + int(px_position[0] - self.agent_line_length * self.scale_factor * np.cos(camera_yaw)), + int(px_position[1] - self.agent_line_length * self.scale_factor * np.sin(camera_yaw)), ) cv2.line( img, @@ -112,9 +98,7 @@ def _draw_agent( return img - def draw_circle( - self, img: np.ndarray, position: np.ndarray, **kwargs: Any - ) -> np.ndarray: + def draw_circle(self, img: np.ndarray, position: np.ndarray, **kwargs: Any) -> np.ndarray: """Draws the point as a circle on the image and returns it""" px_position = self._metric_to_pixel(position) cv2.circle(img, tuple(px_position[::-1]), **kwargs) diff --git a/vlfm/mapping/value_map.py b/vlfm/mapping/value_map.py index cfc1e90..01c5d3d 100644 --- a/vlfm/mapping/value_map.py +++ b/vlfm/mapping/value_map.py @@ -39,6 +39,7 @@ class ValueMap(BaseMap): _last_camera_yaw: float = 0.0 _min_confidence: float = 0.25 _decision_threshold: float = 0.35 + _map: np.ndarray def __init__( self, @@ -75,9 +76,7 @@ def __init__( if RECORDING: if osp.isdir(RECORDING_DIR): - warnings.warn( - f"Recording directory {RECORDING_DIR} already exists. Deleting it." - ) + warnings.warn(f"Recording directory {RECORDING_DIR} already exists. 
Deleting it.") shutil.rmtree(RECORDING_DIR) os.mkdir(RECORDING_DIR) # Dump all args to a file @@ -119,14 +118,11 @@ def update_map( max_depth: The maximum depth value in meters. fov: The field of view of the camera in RADIANS. """ - assert len(values) == self._value_channels, ( - "Incorrect number of values given " - f"({len(values)}). Expected {self._value_channels}." - ) + assert ( + len(values) == self._value_channels + ), f"Incorrect number of values given ({len(values)}). Expected {self._value_channels}." - curr_map = self._localize_new_data( - depth, tf_camera_to_episodic, min_depth, max_depth, fov - ) + curr_map = self._localize_new_data(depth, tf_camera_to_episodic, min_depth, max_depth, fov) # Fuse the new data with the existing data self._fuse_new_data(curr_map, values) @@ -180,9 +176,7 @@ def get_value(point: np.ndarray) -> Union[float, Tuple[float, ...]]: values = [get_value(point) for point in waypoints] if self._value_channels > 1: - assert ( - reduce_fn is not None - ), "Must provide a reduction function when using multiple value channels." + assert reduce_fn is not None, "Must provide a reduction function when using multiple value channels." values = reduce_fn(values) # Use np.argsort to get the indices of the sorted values @@ -224,9 +218,7 @@ def visualize( return map_img - def _process_local_data( - self, depth: np.ndarray, fov: float, min_depth: float, max_depth: float - ) -> np.ndarray: + def _process_local_data(self, depth: np.ndarray, fov: float, min_depth: float, max_depth: float) -> np.ndarray: """Using the FOV and depth, return the visible portion of the FOV. Args: @@ -277,16 +269,10 @@ def _process_local_data( if not os.path.exists("visualizations"): os.makedirs("visualizations") # Expand the depth_row back into a full image - depth_row_full = np.repeat( - depth_row.reshape(1, -1), depth.shape[0], axis=0 - ) + depth_row_full = np.repeat(depth_row.reshape(1, -1), depth.shape[0], axis=0) # Stack the depth images with the visible mask - depth_rgb = cv2.cvtColor( - (depth * 255).astype(np.uint8), cv2.COLOR_GRAY2RGB - ) - depth_row_full = cv2.cvtColor( - (depth_row_full * 255).astype(np.uint8), cv2.COLOR_GRAY2RGB - ) + depth_rgb = cv2.cvtColor((depth * 255).astype(np.uint8), cv2.COLOR_GRAY2RGB) + depth_row_full = cv2.cvtColor((depth_row_full * 255).astype(np.uint8), cv2.COLOR_GRAY2RGB) vis = np.flipud(vis) new_width = int(vis.shape[1] * (depth_rgb.shape[0] / vis.shape[0])) vis_resized = cv2.resize(vis, (new_width, depth_rgb.shape[0])) @@ -376,10 +362,9 @@ def _fuse_new_data(self, new_map: np.ndarray, values: np.ndarray) -> None: 0 and 1, with 1 being the most confident. values: The values attributed to the new portion of the map. """ - assert len(values) == self._value_channels, ( - "Incorrect number of values given " - f"({len(values)}). Expected {self._value_channels}." - ) + assert ( + len(values) == self._value_channels + ), f"Incorrect number of values given ({len(values)}). Expected {self._value_channels}." 
if self._obstacle_map is not None: # If an obstacle map is provided, we will use it to mask out the @@ -405,16 +390,12 @@ def _fuse_new_data(self, new_map: np.ndarray, values: np.ndarray) -> None: self._map[self._map > 0] = 1 new_map[new_map > 0] = 1 else: - assert ( - self._fusion_type == "default" - ), f"Unknown fusion type {self._fusion_type}" + assert self._fusion_type == "default", f"Unknown fusion type {self._fusion_type}" # Any values in the given map that are less confident than # self._decision_threshold AND less than the new_map in the existing map # will be silenced into 0s - new_map_mask = np.logical_and( - new_map < self._decision_threshold, new_map < self._map - ) + new_map_mask = np.logical_and(new_map < self._decision_threshold, new_map < self._map) new_map[new_map_mask] = 0 if self._use_max_confidence: @@ -436,16 +417,10 @@ def _fuse_new_data(self, new_map: np.ndarray, values: np.ndarray) -> None: weight_1 = self._map / confidence_denominator weight_2 = new_map / confidence_denominator - weight_1_channeled = np.repeat( - np.expand_dims(weight_1, axis=2), self._value_channels, axis=2 - ) - weight_2_channeled = np.repeat( - np.expand_dims(weight_2, axis=2), self._value_channels, axis=2 - ) + weight_1_channeled = np.repeat(np.expand_dims(weight_1, axis=2), self._value_channels, axis=2) + weight_2_channeled = np.repeat(np.expand_dims(weight_2, axis=2), self._value_channels, axis=2) - self._value_map = ( - self._value_map * weight_1_channeled + values * weight_2_channeled - ) + self._value_map = self._value_map * weight_1_channeled + values * weight_2_channeled self._map = self._map * weight_1 + new_map * weight_2 # Because confidence_denominator can have 0 values, any nans in either the @@ -454,9 +429,7 @@ def _fuse_new_data(self, new_map: np.ndarray, values: np.ndarray) -> None: self._map = np.nan_to_num(self._map) -def remap( - value: float, from_low: float, from_high: float, to_low: float, to_high: float -) -> float: +def remap(value: float, from_low: float, from_high: float, to_low: float, to_high: float) -> float: """Maps a value from one range to another. Args: diff --git a/vlfm/measurements/traveled_stairs.py b/vlfm/measurements/traveled_stairs.py index fb8fd5f..d1cef14 100644 --- a/vlfm/measurements/traveled_stairs.py +++ b/vlfm/measurements/traveled_stairs.py @@ -18,9 +18,7 @@ class TraveledStairs(Measure): cls_uuid: str = "traveled_stairs" - def __init__( - self, sim: Simulator, config: DictConfig, *args: Any, **kwargs: Any - ) -> None: + def __init__(self, sim: Simulator, config: DictConfig, *args: Any, **kwargs: Any) -> None: self._sim = sim self._config = config self._history: List[np.ndarray] = [] diff --git a/vlfm/obs_transformers/resize.py b/vlfm/obs_transformers/resize.py index 0493c2a..eb2ea5d 100644 --- a/vlfm/obs_transformers/resize.py +++ b/vlfm/obs_transformers/resize.py @@ -41,30 +41,19 @@ def __init__( self.trans_keys: Tuple[str, ...] 
= trans_keys self.semantic_key = semantic_key - def transform_observation_space( - self, observation_space: spaces.Dict - ) -> spaces.Dict: + def transform_observation_space(self, observation_space: spaces.Dict) -> spaces.Dict: observation_space = copy.deepcopy(observation_space) for key in observation_space.spaces: if key in self.trans_keys: # In the observation space dict, the channels are always last - h, w = get_image_height_width( - observation_space.spaces[key], channels_last=True - ) + h, w = get_image_height_width(observation_space.spaces[key], channels_last=True) if self._size == (h, w): continue - logger.info( - "Resizing observation of %s: from %s to %s" - % (key, (h, w), self._size) - ) - observation_space.spaces[key] = overwrite_gym_box_shape( - observation_space.spaces[key], self._size - ) + logger.info("Resizing observation of %s: from %s to %s" % (key, (h, w), self._size)) + observation_space.spaces[key] = overwrite_gym_box_shape(observation_space.spaces[key], self._size) return observation_space - def _transform_obs( - self, obs: torch.Tensor, interpolation_mode: str - ) -> torch.Tensor: + def _transform_obs(self, obs: torch.Tensor, interpolation_mode: str) -> torch.Tensor: return image_resize( obs, self._size, @@ -79,9 +68,7 @@ def forward(self, observations: Dict[str, torch.Tensor]) -> Dict[str, torch.Tens interpolation_mode = "area" if self.semantic_key in sensor: interpolation_mode = "nearest" - observations[sensor] = self._transform_obs( - observations[sensor], interpolation_mode - ) + observations[sensor] = self._transform_obs(observations[sensor], interpolation_mode) return observations @classmethod diff --git a/vlfm/obs_transformers/utils.py b/vlfm/obs_transformers/utils.py index c304b4c..13f759b 100644 --- a/vlfm/obs_transformers/utils.py +++ b/vlfm/obs_transformers/utils.py @@ -35,9 +35,7 @@ def image_resize( # NDHWC -> NDCHW img = img.permute(0, 1, 4, 2, 3) - img = torch.nn.functional.interpolate( - img.float(), size=size, mode=interpolation_mode - ).to(dtype=img.dtype) + img = torch.nn.functional.interpolate(img.float(), size=size, mode=interpolation_mode).to(dtype=img.dtype) if channels_last: if len(img.shape) == 4: # NCHW -> NHWC diff --git a/vlfm/policy/action_replay_policy.py b/vlfm/policy/action_replay_policy.py index 59092da..dbf1653 100644 --- a/vlfm/policy/action_replay_policy.py +++ b/vlfm/policy/action_replay_policy.py @@ -33,9 +33,7 @@ def __init__( **kwargs: Any, ) -> None: super().__init__() - assert ( - "VLFM_RECORD_ACTIONS_DIR" in os.environ - ), "Must set VLFM_RECORD_ACTIONS_DIR" + assert "VLFM_RECORD_ACTIONS_DIR" in os.environ, "Must set VLFM_RECORD_ACTIONS_DIR" self._dir = os.environ["VLFM_RECORD_ACTIONS_DIR"] filepath = os.path.join(self._dir, "actions.txt") with open(filepath, "r") as f: diff --git a/vlfm/policy/base_objectnav_policy.py b/vlfm/policy/base_objectnav_policy.py index 85a9c67..196114b 100644 --- a/vlfm/policy/base_objectnav_policy.py +++ b/vlfm/policy/base_objectnav_policy.py @@ -61,22 +61,14 @@ def __init__( **kwargs: Any, ) -> None: super().__init__() - self._object_detector = GroundingDINOClient( - port=int(os.environ.get("GROUNDING_DINO_PORT", "12181")) - ) - self._coco_object_detector = YOLOv7Client( - port=int(os.environ.get("YOLOV7_PORT", "12184")) - ) - self._mobile_sam = MobileSAMClient( - port=int(os.environ.get("SAM_PORT", "12183")) - ) + self._object_detector = GroundingDINOClient(port=int(os.environ.get("GROUNDING_DINO_PORT", "12181"))) + self._coco_object_detector = 
YOLOv7Client(port=int(os.environ.get("YOLOV7_PORT", "12184"))) + self._mobile_sam = MobileSAMClient(port=int(os.environ.get("SAM_PORT", "12183"))) self._use_vqa = use_vqa if use_vqa: self._vqa = BLIP2Client(port=int(os.environ.get("BLIP2_PORT", "12185"))) self._pointnav_policy = WrappedPointNavResNetPolicy(pointnav_policy_path) - self._object_map: ObjectPointCloudMap = ObjectPointCloudMap( - erosion_size=object_map_erosion_size - ) + self._object_map: ObjectPointCloudMap = ObjectPointCloudMap(erosion_size=object_map_erosion_size) self._depth_image_shape = tuple(depth_image_shape) self._pointnav_stop_radius = pointnav_stop_radius self._visualize = visualize @@ -118,7 +110,7 @@ def act( prev_actions: Any, masks: Tensor, deterministic: bool = False, - ) -> Tuple[Tensor, Tensor]: + ) -> Any: """ Starts the episode by 'initializing' and allowing robot to get its bearings (e.g., spinning in place to get a good view of the scene). @@ -176,9 +168,7 @@ def _initialize(self) -> Tensor: def _explore(self, observations: "TensorDict") -> Tensor: raise NotImplementedError - def _get_target_object_location( - self, position: np.ndarray - ) -> Union[None, np.ndarray]: + def _get_target_object_location(self, position: np.ndarray) -> Union[None, np.ndarray]: if self._object_map.has_object(self._target_object): return self._object_map.get_best_object(self._target_object, position) else: @@ -207,30 +197,20 @@ def _get_policy_info(self, detections: ObjectDetections) -> Dict[str, Any]: return policy_info annotated_depth = self._observations_cache["object_map_rgbd"][0][1] * 255 - annotated_depth = cv2.cvtColor( - annotated_depth.astype(np.uint8), cv2.COLOR_GRAY2RGB - ) + annotated_depth = cv2.cvtColor(annotated_depth.astype(np.uint8), cv2.COLOR_GRAY2RGB) if self._object_masks.sum() > 0: # If self._object_masks isn't all zero, get the object segmentations and # draw them on the rgb and depth images - contours, _ = cv2.findContours( - self._object_masks, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE - ) - annotated_rgb = cv2.drawContours( - detections.annotated_frame, contours, -1, (255, 0, 0), 2 - ) - annotated_depth = cv2.drawContours( - annotated_depth, contours, -1, (255, 0, 0), 2 - ) + contours, _ = cv2.findContours(self._object_masks, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + annotated_rgb = cv2.drawContours(detections.annotated_frame, contours, -1, (255, 0, 0), 2) + annotated_depth = cv2.drawContours(annotated_depth, contours, -1, (255, 0, 0), 2) else: annotated_rgb = self._observations_cache["object_map_rgbd"][0][0] policy_info["annotated_rgb"] = annotated_rgb policy_info["annotated_depth"] = annotated_depth if self._compute_frontiers: - policy_info["obstacle_map"] = cv2.cvtColor( - self._obstacle_map.visualize(), cv2.COLOR_BGR2RGB - ) + policy_info["obstacle_map"] = cv2.cvtColor(self._obstacle_map.visualize(), cv2.COLOR_BGR2RGB) if "DEBUG_INFO" in os.environ: policy_info["render_below_images"].append("debug") @@ -249,16 +229,12 @@ def _get_object_detections(self, img: np.ndarray) -> ObjectDetections: else self._object_detector.predict(img, caption=self._non_coco_caption) ) detections.filter_by_class(target_classes) - det_conf_threshold = ( - self._coco_threshold if has_coco else self._non_coco_threshold - ) + det_conf_threshold = self._coco_threshold if has_coco else self._non_coco_threshold detections.filter_by_conf(det_conf_threshold) if has_coco and has_non_coco and detections.num_detections == 0: # Retry with non-coco object detector - detections = self._object_detector.predict( - img, 
caption=self._non_coco_caption - ) + detections = self._object_detector.predict(img, caption=self._non_coco_caption) detections.filter_by_class(target_classes) detections.filter_by_conf(self._non_coco_threshold) @@ -285,9 +261,7 @@ def _pointnav(self, goal: np.ndarray, stop: bool = False) -> Tensor: robot_xy = self._observations_cache["robot_xy"] heading = self._observations_cache["robot_heading"] rho, theta = rho_theta(robot_xy, heading, goal) - rho_theta_tensor = torch.tensor( - [[rho, theta]], device="cuda", dtype=torch.float32 - ) + rho_theta_tensor = torch.tensor([[rho, theta]], device="cuda", dtype=torch.float32) obs_pointnav = { "depth": image_resize( self._observations_cache["nav_depth"], @@ -343,21 +317,15 @@ def _update_object_map( obs[1] = depth self._observations_cache["object_map_rgbd"][0] = tuple(obs) for idx in range(len(detections.logits)): - bbox_denorm = detections.boxes[idx] * np.array( - [width, height, width, height] - ) + bbox_denorm = detections.boxes[idx] * np.array([width, height, width, height]) object_mask = self._mobile_sam.segment_bbox(rgb, bbox_denorm.tolist()) # If we are using vqa, then use the BLIP2 model to visually confirm whether # the contours are actually correct. if self._use_vqa: - contours, _ = cv2.findContours( - object_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE - ) - annotated_rgb = cv2.drawContours( - rgb.copy(), contours, -1, (255, 0, 0), 2 - ) + contours, _ = cv2.findContours(object_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + annotated_rgb = cv2.drawContours(rgb.copy(), contours, -1, (255, 0, 0), 2) question = f"Question: {self._vqa_prompt}" if not detections.phrases[idx].endswith("ing"): question += "a " @@ -391,9 +359,7 @@ def _cache_observations(self, observations: "TensorDict") -> None: """ raise NotImplementedError - def _infer_depth( - self, rgb: np.ndarray, min_depth: float, max_depth: float - ) -> np.ndarray: + def _infer_depth(self, rgb: np.ndarray, min_depth: float, max_depth: float) -> np.ndarray: """Infers the depth image from the rgb image. Args: diff --git a/vlfm/policy/base_policy.py b/vlfm/policy/base_policy.py index b9c6db8..6819ab4 100644 --- a/vlfm/policy/base_policy.py +++ b/vlfm/policy/base_policy.py @@ -54,9 +54,7 @@ def parameters(self) -> Generator: # Save a dummy state_dict using torch.save. This is useful for generating a pth file # that can be used to load other policies that don't even read from checkpoints, # even though habitat requires a checkpoint to be loaded. 
- config = get_config( - "habitat-lab/habitat-baselines/habitat_baselines/config/pointnav/ppo_pointnav_example.yaml" - ) + config = get_config("habitat-lab/habitat-baselines/habitat_baselines/config/pointnav/ppo_pointnav_example.yaml") dummy_dict = { "config": config, "extra_state": {"step": 0}, diff --git a/vlfm/policy/habitat_policies.py b/vlfm/policy/habitat_policies.py index f35ed75..f564f2e 100644 --- a/vlfm/policy/habitat_policies.py +++ b/vlfm/policy/habitat_policies.py @@ -92,13 +92,9 @@ def __init__( self._dataset_type = dataset_type @classmethod - def from_config( - cls, config: DictConfig, *args_unused: Any, **kwargs_unused: Any - ) -> "HabitatMixin": + def from_config(cls, config: DictConfig, *args_unused: Any, **kwargs_unused: Any) -> "HabitatMixin": policy_config: VLFMPolicyConfig = config.habitat_baselines.rl.policy - kwargs = { - k: policy_config[k] for k in VLFMPolicyConfig.kwaarg_names # type: ignore - } + kwargs = {k: policy_config[k] for k in VLFMPolicyConfig.kwaarg_names} # type: ignore # In habitat, we need the height of the camera to generate the camera transform sim_sensors_cfg = config.habitat.simulator.agents.main_agent.sim_sensors @@ -137,16 +133,12 @@ def act( obs_dict[ObjectGoalSensor.cls_uuid] = HM3D_ID_TO_NAME[object_id] elif self._dataset_type == "mp3d": obs_dict[ObjectGoalSensor.cls_uuid] = MP3D_ID_TO_NAME[object_id] - self._non_coco_caption = ( - " . ".join(MP3D_ID_TO_NAME).replace("|", " . ") + " ." - ) + self._non_coco_caption = " . ".join(MP3D_ID_TO_NAME).replace("|", " . ") + " ." else: raise ValueError(f"Dataset type {self._dataset_type} not recognized") parent_cls: BaseObjectNavPolicy = super() # type: ignore try: - action, rnn_hidden_states = parent_cls.act( - obs_dict, rnn_hidden_states, prev_actions, masks, deterministic - ) + action, rnn_hidden_states = parent_cls.act(obs_dict, rnn_hidden_states, prev_actions, masks, deterministic) except StopIteration: action = self._stop_action return PolicyActionData( @@ -178,9 +170,7 @@ def _get_policy_info(self, detections: ObjectDetections) -> Dict[str, Any]: info["start_yaw"] = self._start_yaw return info - def _cache_observations( - self: Union["HabitatMixin", BaseObjectNavPolicy], observations: TensorDict - ) -> None: + def _cache_observations(self: Union["HabitatMixin", BaseObjectNavPolicy], observations: TensorDict) -> None: """Caches the rgb, depth, and camera transform from the observations. Args: diff --git a/vlfm/policy/itm_policy.py b/vlfm/policy/itm_policy.py index c5bd559..878a124 100644 --- a/vlfm/policy/itm_policy.py +++ b/vlfm/policy/itm_policy.py @@ -89,9 +89,7 @@ def _get_best_frontier( Tuple[np.ndarray, float]: The best frontier and its value. """ # The points and values will be sorted in descending order - sorted_pts, sorted_values = self._sort_frontiers_by_value( - observations, frontiers - ) + sorted_pts, sorted_values = self._sort_frontiers_by_value(observations, frontiers) robot_xy = self._observations_cache["robot_xy"] best_frontier_idx = None top_two_values = tuple(sorted_values[:2]) @@ -110,9 +108,7 @@ def _get_best_frontier( break if curr_index is None: - closest_index = closest_point_within_threshold( - sorted_pts, self._last_frontier, threshold=0.5 - ) + closest_index = closest_point_within_threshold(sorted_pts, self._last_frontier, threshold=0.5) if closest_index != -1: # There is a point close to the last point pursued @@ -131,9 +127,7 @@ def _get_best_frontier( # it is not cyclic. 
if best_frontier_idx is None: for idx, frontier in enumerate(sorted_pts): - cyclic = self._acyclic_enforcer.check_cyclic( - robot_xy, frontier, top_two_values - ) + cyclic = self._acyclic_enforcer.check_cyclic(robot_xy, frontier, top_two_values) if cyclic: print("Suppressed cyclic frontier.") continue @@ -209,9 +203,7 @@ def _update_value_map(self) -> None: for cosine, (rgb, depth, tf, min_depth, max_depth, fov) in zip( cosines, self._observations_cache["value_map_rgbd"] ): - self._value_map.update_map( - np.array(cosine), depth, tf, min_depth, max_depth, fov - ) + self._value_map.update_map(np.array(cosine), depth, tf, min_depth, max_depth, fov) self._value_map.update_agent_traj( self._observations_cache["robot_xy"], @@ -240,9 +232,7 @@ def act( self._pre_step(observations, masks) if self._visualize: self._update_value_map() - return super().act( - observations, rnn_hidden_states, prev_actions, masks, deterministic - ) + return super().act(observations, rnn_hidden_states, prev_actions, masks, deterministic) def _reset(self) -> None: super()._reset() @@ -265,12 +255,10 @@ def act( prev_actions: Any, masks: Tensor, deterministic: bool = False, - ) -> Tuple[Tensor, Tensor]: + ) -> Any: self._pre_step(observations, masks) self._update_value_map() - return super().act( - observations, rnn_hidden_states, prev_actions, masks, deterministic - ) + return super().act(observations, rnn_hidden_states, prev_actions, masks, deterministic) def _sort_frontiers_by_value( self, observations: "TensorDict", frontiers: np.ndarray @@ -301,9 +289,7 @@ def visualize_value_map(arr: np.ndarray) -> np.ndarray: def _sort_frontiers_by_value( self, observations: "TensorDict", frontiers: np.ndarray ) -> Tuple[np.ndarray, List[float]]: - sorted_frontiers, sorted_values = self._value_map.sort_waypoints( - frontiers, 0.5, reduce_fn=self._reduce_values - ) + sorted_frontiers, sorted_values = self._value_map.sort_waypoints(frontiers, 0.5, reduce_fn=self._reduce_values) return sorted_frontiers, sorted_values diff --git a/vlfm/policy/reality_policies.py b/vlfm/policy/reality_policies.py index 6711dff..fe512f4 100644 --- a/vlfm/policy/reality_policies.py +++ b/vlfm/policy/reality_policies.py @@ -1,7 +1,7 @@ # Copyright (c) 2023 Boston Dynamics AI Institute LLC. All rights reserved. 
from dataclasses import dataclass -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Union import numpy as np import torch @@ -35,19 +35,15 @@ class RealityMixin: _policy_info: Dict[str, Any] = {} _done_initializing: bool = False - def __init__( - self: Union["RealityMixin", ITMPolicyV2], *args: Any, **kwargs: Any - ) -> None: + def __init__(self: Union["RealityMixin", ITMPolicyV2], *args: Any, **kwargs: Any) -> None: super().__init__(sync_explored_areas=True, *args, **kwargs) # type: ignore - self._depth_model = torch.hub.load( - "isl-org/ZoeDepth", "ZoeD_NK", config_mode="eval", pretrained=True - ).to("cuda" if torch.cuda.is_available() else "cpu") + self._depth_model = torch.hub.load("isl-org/ZoeDepth", "ZoeD_NK", config_mode="eval", pretrained=True).to( + "cuda" if torch.cuda.is_available() else "cpu" + ) self._object_map.use_dbscan = False # type: ignore @classmethod - def from_config( - cls, config: DictConfig, *args_unused: Any, **kwargs_unused: Any - ) -> Any: + def from_config(cls, config: DictConfig, *args_unused: Any, **kwargs_unused: Any) -> Any: policy_config: VLFMConfig = config.policy kwargs = {k: policy_config[k] for k in VLFMConfig.kwaarg_names} # type: ignore @@ -60,52 +56,41 @@ def act( prev_actions: Any, masks: Tensor, deterministic: bool = False, - ) -> Tuple[Tensor, Tensor]: + ) -> Dict[str, Any]: if observations["objectgoal"] not in self._non_coco_caption: - self._non_coco_caption = ( - observations["objectgoal"] + " . " + self._non_coco_caption - ) + self._non_coco_caption = observations["objectgoal"] + " . " + self._non_coco_caption parent_cls: ITMPolicyV2 = super() # type: ignore - action: Tensor = parent_cls.act( - observations, rnn_hidden_states, prev_actions, masks, deterministic - )[0] + action: Tensor = parent_cls.act(observations, rnn_hidden_states, prev_actions, masks, deterministic)[0] # The output of the policy is a (1, 2) tensor of floats, where the first element # is the linear velocity and the second element is the angular velocity. We # convert this numpy array to a dictionary with keys "angular" and "linear" so # that it can be passed to the Spot robot. 
if self._done_initializing: - angular = action[0][0].item() - linear = action[0][1].item() - arm_yaw = -1.0 + action_dict = { + "angular": action[0][0].item(), + "linear": action[0][1].item(), + "arm_yaw": -1, + "info": self._policy_info, + } else: - angular = 0.0 - linear = 0.0 - arm_yaw = action[0][0].item() - - self._done_initializing = len(self._initial_yaws) == 0 - - action = torch.tensor([[angular, linear, arm_yaw]], dtype=torch.float32) - - return action, rnn_hidden_states + action_dict = { + "angular": 0, + "linear": 0, + "arm_yaw": action[0][0].item(), + "info": self._policy_info, + } - def get_action( - self, observations: Dict[str, Any], masks: Tensor, deterministic: bool = True - ) -> Dict[str, Any]: - actions, _ = self.act( - observations, None, None, masks, deterministic=deterministic - ) - action_dict = { - "angular": actions[0], - "linear": actions[1], - "arm_yaw": actions[2], - "info": self._policy_info, - } if "rho_theta" in self._policy_info: action_dict["rho_theta"] = self._policy_info["rho_theta"] + self._done_initializing = len(self._initial_yaws) == 0 + return action_dict + def get_action(self, observations: Dict[str, Any], masks: Tensor, deterministic: bool = True) -> Dict[str, Any]: + return self.act(observations, None, None, masks, deterministic=deterministic) + def _reset(self: Union["RealityMixin", ITMPolicyV2]) -> None: parent_cls: ITMPolicyV2 = super() # type: ignore parent_cls._reset() @@ -116,9 +101,7 @@ def _initialize(self) -> Tensor: yaw = self._initial_yaws.pop(0) return torch.tensor([[yaw]], dtype=torch.float32) - def _cache_observations( - self: Union["RealityMixin", ITMPolicyV2], observations: Dict[str, Any] - ) -> None: + def _cache_observations(self: Union["RealityMixin", ITMPolicyV2], observations: Dict[str, Any]) -> None: """Caches the rgb, depth, and camera transform from the observations. Args: @@ -141,9 +124,7 @@ def _cache_observations( explore=False, ) - _, tf, min_depth, max_depth, fx, fy, topdown_fov = observations[ - "obstacle_map_depths" - ][-1] + _, tf, min_depth, max_depth, fx, fy, topdown_fov = observations["obstacle_map_depths"][-1] self._obstacle_map.update_map( None, tf, @@ -156,9 +137,7 @@ def _cache_observations( update_obstacles=False, ) - self._obstacle_map.update_agent_traj( - observations["robot_xy"], observations["robot_heading"] - ) + self._obstacle_map.update_agent_traj(observations["robot_xy"], observations["robot_heading"]) frontiers = self._obstacle_map.frontiers height, width = observations["nav_depth"].shape @@ -174,9 +153,7 @@ def _cache_observations( "value_map_rgbd": observations["value_map_rgbd"], } - def _infer_depth( - self, rgb: np.ndarray, min_depth: float, max_depth: float - ) -> np.ndarray: + def _infer_depth(self, rgb: np.ndarray, min_depth: float, max_depth: float) -> np.ndarray: """Infers the depth image from the rgb image. 
Args: diff --git a/vlfm/policy/utils/acyclic_enforcer.py b/vlfm/policy/utils/acyclic_enforcer.py index be85bcb..d6762b1 100644 --- a/vlfm/policy/utils/acyclic_enforcer.py +++ b/vlfm/policy/utils/acyclic_enforcer.py @@ -19,15 +19,11 @@ def __hash__(self) -> int: class AcyclicEnforcer: history: Set[StateAction] = set() - def check_cyclic( - self, position: np.ndarray, action: Any, other: Any = None - ) -> bool: + def check_cyclic(self, position: np.ndarray, action: Any, other: Any = None) -> bool: state_action = StateAction(position, action, other) cyclic = state_action in self.history return cyclic - def add_state_action( - self, position: np.ndarray, action: Any, other: Any = None - ) -> None: + def add_state_action(self, position: np.ndarray, action: Any, other: Any = None) -> None: state_action = StateAction(position, action, other) self.history.add(state_action) diff --git a/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py b/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py index 6373e96..cae0612 100644 --- a/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py +++ b/vlfm/policy/utils/non_habitat_policy/nh_pointnav_policy.py @@ -19,16 +19,12 @@ def __init__(self) -> None: self.running_mean_and_var = nn.Sequential() self.backbone = resnet18(1, 32, 16) self.compression = nn.Sequential( - nn.Conv2d( - 256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False - ), + nn.Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False), nn.GroupNorm(1, 128, eps=1e-05, affine=True), nn.ReLU(inplace=True), ) - def forward( - self, observations: Dict[str, torch.Tensor] - ) -> torch.Tensor: # type: ignore + def forward(self, observations: Dict[str, torch.Tensor]) -> torch.Tensor: # type: ignore cnn_input = [] for k in self.visual_keys: obs_k = observations[k] @@ -51,9 +47,7 @@ def __init__(self, discrete_actions: bool = False, no_fwd_dict: bool = False): if discrete_actions: self.prev_action_embedding_discrete = nn.Embedding(4 + 1, 32) else: - self.prev_action_embedding_cont = nn.Linear( - in_features=2, out_features=32, bias=True - ) + self.prev_action_embedding_cont = nn.Linear(in_features=2, out_features=32, bias=True) self.tgt_embeding = nn.Linear(in_features=3, out_features=32, bias=True) self.visual_encoder = ResNetEncoder() self.visual_fc = nn.Sequential( @@ -104,9 +98,7 @@ def forward( x.append(prev_actions) out = torch.cat(x, dim=1) - out, rnn_hidden_states = self.state_encoder( - out, rnn_hidden_states, masks, rnn_build_seq_info - ) + out, rnn_hidden_states = self.state_encoder(out, rnn_hidden_states, masks, rnn_build_seq_info) if self.no_fwd_dict: return out, rnn_hidden_states # type: ignore @@ -159,9 +151,7 @@ def act( masks: torch.Tensor, deterministic: bool = False, ) -> Tuple[torch.Tensor, torch.Tensor]: - features, rnn_hidden_states, _ = self.net( - observations, rnn_hidden_states, prev_actions, masks - ) + features, rnn_hidden_states, _ = self.net(observations, rnn_hidden_states, prev_actions, masks) distribution = self.action_distribution(features) if deterministic: diff --git a/vlfm/policy/utils/non_habitat_policy/resnet.py b/vlfm/policy/utils/non_habitat_policy/resnet.py index e1a18c2..9767db8 100755 --- a/vlfm/policy/utils/non_habitat_policy/resnet.py +++ b/vlfm/policy/utils/non_habitat_policy/resnet.py @@ -48,9 +48,7 @@ def forward(self, x: Tensor) -> Tensor: return self.relu(out + residual) -def conv3x3( - in_planes: int, out_planes: int, stride: int = 1, groups: int = 1 -) -> Conv2d: +def conv3x3(in_planes: int, 
out_planes: int, stride: int = 1, groups: int = 1) -> Conv2d: """3x3 convolution with padding""" return nn.Conv2d( in_planes, @@ -98,18 +96,10 @@ def __init__( if block.resneXt: base_planes *= 2 - self.layer1 = self._make_layer( - block, ngroups, base_planes, layers[0] # type: ignore - ) - self.layer2 = self._make_layer( - block, ngroups, base_planes * 2, layers[1], stride=2 # type: ignore - ) - self.layer3 = self._make_layer( - block, ngroups, base_planes * 2 * 2, layers[2], stride=2 # type: ignore - ) - self.layer4 = self._make_layer( - block, ngroups, base_planes * 2 * 2 * 2, layers[3], stride=2 # type: ignore - ) + self.layer1 = self._make_layer(block, ngroups, base_planes, layers[0]) # type: ignore + self.layer2 = self._make_layer(block, ngroups, base_planes * 2, layers[1], stride=2) # type: ignore + self.layer3 = self._make_layer(block, ngroups, base_planes * 2 * 2, layers[2], stride=2) # type: ignore + self.layer4 = self._make_layer(block, ngroups, base_planes * 2 * 2 * 2, layers[3], stride=2) # type: ignore self.final_channels = self.inplanes self.final_spatial_compress = 1.0 / (2**5) diff --git a/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py b/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py index 66e121f..e518b2b 100755 --- a/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py +++ b/vlfm/policy/utils/non_habitat_policy/rnn_state_encoder.py @@ -57,9 +57,7 @@ def single_forward( ) -> Tuple[torch.Tensor, torch.Tensor]: r"""Forward for a non-sequence input""" - hidden_states = torch.where( - masks.view(1, -1, 1), hidden_states, hidden_states.new_zeros(()) - ) + hidden_states = torch.where(masks.view(1, -1, 1), hidden_states, hidden_states.new_zeros(())) x, hidden_states = self.rnn(x.unsqueeze(0), self.unpack_hidden(hidden_states)) hidden_states = self.pack_hidden(hidden_states) @@ -114,9 +112,7 @@ def forward( x, hidden_states = self.single_forward(x, hidden_states, masks) else: assert rnn_build_seq_info is not None - x, hidden_states = self.seq_forward( - x, hidden_states, masks, rnn_build_seq_info - ) + x, hidden_states = self.seq_forward(x, hidden_states, masks, rnn_build_seq_info) hidden_states = hidden_states.permute(1, 0, 2) @@ -134,14 +130,10 @@ def __init__( # Note: Type handling mypy errors in pytorch libraries prevent # directly setting hidden_states type - def pack_hidden( - self, hidden_states: Any # type is Tuple[torch.Tensor, torch.Tensor] - ) -> torch.Tensor: + def pack_hidden(self, hidden_states: Any) -> torch.Tensor: # type is Tuple[torch.Tensor, torch.Tensor] return torch.cat(hidden_states, 0) - def unpack_hidden( - self, hidden_states: torch.Tensor - ) -> Any: # type is Tuple[torch.Tensor, torch.Tensor] + def unpack_hidden(self, hidden_states: torch.Tensor) -> Any: # type is Tuple[torch.Tensor, torch.Tensor] lstm_states = torch.chunk(hidden_states.contiguous(), 2, 0) return (lstm_states[0], lstm_states[1]) @@ -151,7 +143,10 @@ def build_rnn_inputs( rnn_states: torch.Tensor, not_dones: torch.Tensor, rnn_build_seq_info: Dict[str, torch.Tensor], -) -> Tuple[PackedSequence, torch.Tensor,]: +) -> Tuple[ + PackedSequence, + torch.Tensor, +]: r"""Create a PackedSequence input for an RNN such that each set of steps that are part of the same episode are all part of a batch in the PackedSequence. 
@@ -223,9 +218,7 @@ def build_rnn_out_from_seq( rnn_state_batch_inds = rnn_build_seq_info["rnn_state_batch_inds"] output_hidden_states = hidden_states.index_select( 1, - last_sequence_in_batch_inds[ - _invert_permutation(rnn_state_batch_inds[last_sequence_in_batch_inds]) - ], + last_sequence_in_batch_inds[_invert_permutation(rnn_state_batch_inds[last_sequence_in_batch_inds])], ) return x, output_hidden_states diff --git a/vlfm/policy/utils/pointnav_policy.py b/vlfm/policy/utils/pointnav_policy.py index 002de45..517d067 100644 --- a/vlfm/policy/utils/pointnav_policy.py +++ b/vlfm/policy/utils/pointnav_policy.py @@ -22,9 +22,7 @@ class PointNavResNetTensorOutputPolicy(PointNavResNetPolicy): def act(self, *args: Any, **kwargs: Any) -> Tuple[Tensor, Tensor]: - value, action, action_log_probs, rnn_hidden_states = super().act( - *args, **kwargs - ) + value, action, action_log_probs, rnn_hidden_states = super().act(*args, **kwargs) return action, rnn_hidden_states else: @@ -126,9 +124,7 @@ def reset(self) -> None: """ Resets the hidden state and previous action for the policy. """ - self.pointnav_test_recurrent_hidden_states = torch.zeros_like( - self.pointnav_test_recurrent_hidden_states - ) + self.pointnav_test_recurrent_hidden_states = torch.zeros_like(self.pointnav_test_recurrent_hidden_states) self.pointnav_prev_actions = torch.zeros_like(self.pointnav_prev_actions) @@ -143,9 +139,7 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy: if HABITAT_BASELINES_AVAILABLE: obs_space = SpaceDict( { - "depth": spaces.Box( - low=0.0, high=1.0, shape=(224, 224, 1), dtype=np.float32 - ), + "depth": spaces.Box(low=0.0, high=1.0, shape=(224, 224, 1), dtype=np.float32), "pointgoal_with_gps_compass": spaces.Box( low=np.finfo(np.float32).min, high=np.finfo(np.float32).max, @@ -174,15 +168,11 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy: ) # print(pointnav_policy) - pointnav_policy.net = PointNavResNetNet( - discrete_actions=True, no_fwd_dict=True - ) + pointnav_policy.net = PointNavResNetNet(discrete_actions=True, no_fwd_dict=True) state_dict = torch.load(file_path + ".state_dict", map_location="cpu") else: ckpt_dict = torch.load(file_path, map_location="cpu") - pointnav_policy = PointNavResNetTensorOutputPolicy.from_config( - ckpt_dict["config"], obs_space, action_space - ) + pointnav_policy = PointNavResNetTensorOutputPolicy.from_config(ckpt_dict["config"], obs_space, action_space) state_dict = ckpt_dict["state_dict"] pointnav_policy.load_state_dict(state_dict) return pointnav_policy @@ -191,14 +181,15 @@ def load_pointnav_policy(file_path: str) -> PointNavResNetTensorOutputPolicy: ckpt_dict = torch.load(file_path, map_location="cpu") pointnav_policy = PointNavResNetTensorOutputPolicy() current_state_dict = pointnav_policy.state_dict() - pointnav_policy.load_state_dict( - {k: v for k, v in ckpt_dict.items() if k in current_state_dict} - ) + # Let old checkpoints work with new code + if "net.prev_action_embedding_cont.bias" not in ckpt_dict.keys(): + ckpt_dict["net.prev_action_embedding_cont.bias"] = ckpt_dict["net.prev_action_embedding.bias"] + if "net.prev_action_embedding_cont.weights" not in ckpt_dict.keys(): + ckpt_dict["net.prev_action_embedding_cont.weight"] = ckpt_dict["net.prev_action_embedding.weight"] + + pointnav_policy.load_state_dict({k: v for k, v in ckpt_dict.items() if k in current_state_dict}) unused_keys = [k for k in ckpt_dict.keys() if k not in current_state_dict] - print( - "The following unused keys were not loaded when 
loading the pointnav" - f" policy: {unused_keys}" - ) + print(f"The following unused keys were not loaded when loading the pointnav policy: {unused_keys}") return pointnav_policy diff --git a/vlfm/reality/bdsw_nav_env.py b/vlfm/reality/bdsw_nav_env.py index 93cd6d5..81b8868 100644 --- a/vlfm/reality/bdsw_nav_env.py +++ b/vlfm/reality/bdsw_nav_env.py @@ -9,9 +9,7 @@ from vlfm.reality.robots.bdsw_robot import BDSWRobot -def run_env( - env: PointNavEnv, policy: WrappedPointNavResNetPolicy, goal: np.ndarray -) -> None: +def run_env(env: PointNavEnv, policy: WrappedPointNavResNetPolicy, goal: np.ndarray) -> None: observations = env.reset(goal) done = False mask = torch.zeros(1, 1, device=policy.device, dtype=torch.bool) diff --git a/vlfm/reality/objectnav_env.py b/vlfm/reality/objectnav_env.py index 78b9cfa..34bfcfb 100644 --- a/vlfm/reality/objectnav_env.py +++ b/vlfm/reality/objectnav_env.py @@ -103,9 +103,7 @@ def step(self, action: Dict[str, Any]) -> Tuple[Dict, float, bool, Dict]: return super().step(action) if action["arm_yaw"] == 0: - cmd_id = self.robot.spot.move_gripper_to_point( - np.array([0.35, 0.0, 0.3]), np.deg2rad([0.0, 0.0, 0.0]) - ) + cmd_id = self.robot.spot.move_gripper_to_point(np.array([0.35, 0.0, 0.3]), np.deg2rad([0.0, 0.0, 0.0])) self.robot.spot.block_until_arm_arrives(cmd_id, timeout_sec=1.5) else: new_pose = np.array(NOMINAL_ARM_POSE) @@ -119,9 +117,7 @@ def step(self, action: Dict[str, Any]) -> Tuple[Dict, float, bool, Dict]: def _get_obs(self) -> Dict[str, Any]: robot_xy, robot_heading = self._get_gps(), self._get_compass() - nav_depth, obstacle_map_depths, value_map_rgbd, object_map_rgbd = ( - self._get_camera_obs() - ) + nav_depth, obstacle_map_depths, value_map_rgbd, object_map_rgbd = self._get_camera_obs() return { "nav_depth": nav_depth, "robot_xy": robot_xy, @@ -142,9 +138,7 @@ def _get_camera_obs(self) -> Tuple[np.ndarray, List, List, List]: for src in ALL_CAMS: tf = self.tf_global_to_episodic @ cam_data[src]["tf_camera_to_global"] # a tf that remaps from camera conventions to xyz conventions - rotation_matrix = np.array( - [[0, -1, 0, 0], [0, 0, -1, 0], [1, 0, 0, 0], [0, 0, 0, 1]] - ) + rotation_matrix = np.array([[0, -1, 0, 0], [0, 0, -1, 0], [1, 0, 0, 0], [0, 0, 0, 1]]) cam_data[src]["tf_camera_to_global"] = np.dot(tf, rotation_matrix) img = cam_data[src]["image"] @@ -183,9 +177,7 @@ def _get_camera_obs(self) -> Tuple[np.ndarray, List, List, List]: # to be upright f_left = SpotCamIds.FRONTLEFT_DEPTH f_right = SpotCamIds.FRONTRIGHT_DEPTH - nav_cam_data = self.robot.reorient_images( - {k: cam_data[k]["image"] for k in [f_left, f_right]} - ) + nav_cam_data = self.robot.reorient_images({k: cam_data[k]["image"] for k in [f_left, f_right]}) nav_depth = np.hstack([nav_cam_data[f_right], nav_cam_data[f_left]]) nav_depth = filter_depth(nav_depth, blur_type=None, set_black_value=1.0) diff --git a/vlfm/reality/pointnav_env.py b/vlfm/reality/pointnav_env.py index ee5dfbf..afefbde 100644 --- a/vlfm/reality/pointnav_env.py +++ b/vlfm/reality/pointnav_env.py @@ -41,9 +41,7 @@ def __init__( self._cmd_id: Union[None, Any] = None self._num_steps = 0 - def reset( - self, goal: Any, relative: bool = True, *args: Any, **kwargs: Any - ) -> Dict[str, np.ndarray]: + def reset(self, goal: Any, relative: bool = True, *args: Any, **kwargs: Any) -> Dict[str, np.ndarray]: assert isinstance(goal, np.ndarray) if relative: # Transform (x,y) goal from robot frame to global frame @@ -129,18 +127,12 @@ def _get_obs(self) -> Dict[str, np.ndarray]: } def _get_nav_depth(self) -> np.ndarray: - 
images = self.robot.get_camera_images( - [SpotCamIds.FRONTRIGHT_DEPTH, SpotCamIds.FRONTLEFT_DEPTH] - ) - img = np.hstack( - [images[SpotCamIds.FRONTRIGHT_DEPTH], images[SpotCamIds.FRONTLEFT_DEPTH]] - ) + images = self.robot.get_camera_images([SpotCamIds.FRONTRIGHT_DEPTH, SpotCamIds.FRONTLEFT_DEPTH]) + img = np.hstack([images[SpotCamIds.FRONTRIGHT_DEPTH], images[SpotCamIds.FRONTLEFT_DEPTH]]) img = self._norm_depth(img) return img - def _norm_depth( - self, depth: np.ndarray, max_depth: Optional[float] = None, scale: bool = True - ) -> np.ndarray: + def _norm_depth(self, depth: np.ndarray, max_depth: Optional[float] = None, scale: bool = True) -> np.ndarray: if max_depth is None: max_depth = self._max_body_cam_depth diff --git a/vlfm/reality/robots/base_robot.py b/vlfm/reality/robots/base_robot.py index 903ebeb..120a8eb 100644 --- a/vlfm/reality/robots/base_robot.py +++ b/vlfm/reality/robots/base_robot.py @@ -103,9 +103,7 @@ def get_camera_images(self, camera_source: List[str]) -> Dict[str, np.ndarray]: """ for source in camera_source: assert source in CAM_ID_TO_SHAPE, f"Invalid camera source: {source}" - images = { - source: np.random.rand(*CAM_ID_TO_SHAPE[source]) for source in camera_source - } + images = {source: np.random.rand(*CAM_ID_TO_SHAPE[source]) for source in camera_source} return self.reorient_images(images) def command_base_velocity(self, ang_vel: float, lin_vel: float) -> None: diff --git a/vlfm/reality/robots/bdsw_robot.py b/vlfm/reality/robots/bdsw_robot.py index 3b958e5..28b6d24 100644 --- a/vlfm/reality/robots/bdsw_robot.py +++ b/vlfm/reality/robots/bdsw_robot.py @@ -25,9 +25,7 @@ def xy_yaw(self) -> Tuple[np.ndarray, float]: def arm_joints(self) -> np.ndarray: """Returns current angle for each of the 6 arm joints in radians""" arm_proprioception = self.spot.get_arm_proprioception() - current_positions = np.array( - [v.position.value for v in arm_proprioception.values()] - ) + current_positions = np.array([v.position.value for v in arm_proprioception.values()]) return current_positions def get_camera_images(self, camera_source: List[str]) -> Dict[str, np.ndarray]: @@ -101,8 +99,7 @@ def get_camera_data(self, srcs: List[str]) -> Dict[str, Dict[str, Any]]: """ image_responses = self.spot.get_image_responses(srcs) imgs = { - src: self._camera_response_to_data(image_response) - for src, image_response in zip(srcs, image_responses) + src: self._camera_response_to_data(image_response) for src, image_response in zip(srcs, image_responses) } return imgs @@ -116,7 +113,5 @@ def _camera_response_to_data(self, response: Any) -> Dict[str, Any]: "image": image, "fx": fx, "fy": fy, - "tf_camera_to_global": self.spot.get_transform( - from_frame=camera_frame, tf_snapshot=tf_snapshot - ), + "tf_camera_to_global": self.spot.get_transform(from_frame=camera_frame, tf_snapshot=tf_snapshot), } diff --git a/vlfm/reality/robots/frame_ids.py b/vlfm/reality/robots/frame_ids.py new file mode 100644 index 0000000..7da70ec --- /dev/null +++ b/vlfm/reality/robots/frame_ids.py @@ -0,0 +1,8 @@ +class SpotFrameIds: + BODY: str = "body" + FLAT_BODY: str = "flat_body" + GPE: str = "gpe" + HAND: str = "hand" + LINK_WR1: str = "link_wr1" + ODOM: str = "odom" + VISION: str = "vision" diff --git a/vlfm/reality/run_bdsw_objnav_env.py b/vlfm/reality/run_bdsw_objnav_env.py index 46edf3e..1bb9004 100644 --- a/vlfm/reality/run_bdsw_objnav_env.py +++ b/vlfm/reality/run_bdsw_objnav_env.py @@ -13,9 +13,7 @@ from vlfm.reality.robots.bdsw_robot import BDSWRobot -@hydra.main( - version_base=None, 
config_path="../../config/", config_name="experiments/reality" -) +@hydra.main(version_base=None, config_path="../../config/", config_name="experiments/reality") def main(cfg: RealityConfig) -> None: print(OmegaConf.to_yaml(cfg)) policy = RealityITMPolicyV2.from_config(cfg) @@ -27,9 +25,7 @@ def main(cfg: RealityConfig) -> None: robot = BDSWRobot(spot) robot.open_gripper() # robot.set_arm_joints(NOMINAL_ARM_POSE, travel_time=0.75) - cmd_id = robot.spot.move_gripper_to_point( - np.array([0.35, 0.0, 0.6]), np.deg2rad([0.0, 20.0, 0.0]) - ) + cmd_id = robot.spot.move_gripper_to_point(np.array([0.35, 0.0, 0.6]), np.deg2rad([0.0, 20.0, 0.0])) spot.block_until_arm_arrives(cmd_id, timeout_sec=1.5) env = ObjectNavEnv( robot=robot, diff --git a/vlfm/semexp_env/eval.py b/vlfm/semexp_env/eval.py index 3dc846a..05a8398 100644 --- a/vlfm/semexp_env/eval.py +++ b/vlfm/semexp_env/eval.py @@ -65,8 +65,7 @@ def main() -> None: policy_cls = SemExpITMPolicyV3 policy_kwargs["exploration_thresh"] = exp_thresh policy_kwargs["text_prompt"] = ( - "Seems like there is a target_object ahead.|There is a lot of area to" - " explore ahead." + "Seems like there is a target_object ahead.|There is a lot of area to explore ahead." ) else: policy_cls = SemExpITMPolicyV2 @@ -126,17 +125,13 @@ def main() -> None: print("Test successfully completed") -def merge_obs_infos( - obs: torch.Tensor, infos: Tuple[Dict, ...] -) -> Dict[str, torch.Tensor]: +def merge_obs_infos(obs: torch.Tensor, infos: Tuple[Dict, ...]) -> Dict[str, torch.Tensor]: """Merge the observations and infos into a single dictionary.""" rgb = obs[:, :3, ...].permute(0, 2, 3, 1) depth = obs[:, 3:4, ...].permute(0, 2, 3, 1) info_dict = infos[0] - def tensor_from_numpy( - tensor: torch.Tensor, numpy_array: np.ndarray - ) -> torch.Tensor: + def tensor_from_numpy(tensor: torch.Tensor, numpy_array: np.ndarray) -> torch.Tensor: device = tensor.device new_tensor = torch.from_numpy(numpy_array).to(device) return new_tensor @@ -177,9 +172,7 @@ def create_frame(policy_infos: Dict[str, Any]) -> np.ndarray: return vis_img -def generate_video( - frames: List[np.ndarray], ep_id: str, scene_id: str, infos: Dict[str, Any] -) -> None: +def generate_video(frames: List[np.ndarray], ep_id: str, scene_id: str, infos: Dict[str, Any]) -> None: """ Saves the given list of rgb frames as a video at 10 FPS. 
Uses the infos to get the files name, which should contain the following: @@ -200,8 +193,7 @@ def generate_video( dtg = infos["distance_to_goal"] goal_name = infos["target_object"] filename = ( - f"epid={episode_id:03d}-scid={scene_id}-succ={success}-spl={spl:.2f}" - f"-dtg={dtg:.2f}-target={goal_name}.mp4" + f"epid={episode_id:03d}-scid={scene_id}-succ={success}-spl={spl:.2f}-dtg={dtg:.2f}-target={goal_name}.mp4" ) filename = os.path.join(video_dir, filename) diff --git a/vlfm/utils/episode_stats_logger.py b/vlfm/utils/episode_stats_logger.py index 892180c..f0165d6 100644 --- a/vlfm/utils/episode_stats_logger.py +++ b/vlfm/utils/episode_stats_logger.py @@ -85,9 +85,7 @@ def was_false_positive(infos: Dict[str, Any]) -> bool: """Return whether the point goal target is within a bounding box.""" target_bboxes_mask = infos["top_down_map"]["target_bboxes_mask"] nav_goal_episodic_xy = infos["nav_goal"] - nav_goal_episodic_xyz = np.array( - [nav_goal_episodic_xy[0], nav_goal_episodic_xy[1], 0] - ).reshape(1, 3) + nav_goal_episodic_xyz = np.array([nav_goal_episodic_xy[0], nav_goal_episodic_xy[1], 0]).reshape(1, 3) upper_bound = infos["top_down_map"]["upper_bound"] lower_bound = infos["top_down_map"]["lower_bound"] diff --git a/vlfm/utils/geometry_utils.py b/vlfm/utils/geometry_utils.py index 9a66882..5cc2335 100644 --- a/vlfm/utils/geometry_utils.py +++ b/vlfm/utils/geometry_utils.py @@ -6,9 +6,7 @@ import numpy as np -def rho_theta( - curr_pos: np.ndarray, curr_heading: float, curr_goal: np.ndarray -) -> Tuple[float, float]: +def rho_theta(curr_pos: np.ndarray, curr_heading: float, curr_goal: np.ndarray) -> Tuple[float, float]: """Calculates polar coordinates (rho, theta) relative to a given position and heading to a given goal position. 'rho' is the distance from the agent to the goal, and theta is how many radians the agent must turn (to the left, CCW from above) to @@ -82,15 +80,10 @@ def calculate_vfov(hfov: float, width: int, height: int) -> float: A float representing the VFOV in radians. """ # Calculate the diagonal field of view (DFOV) - dfov = 2 * math.atan( - math.tan(hfov / 2) - * math.sqrt((width**2 + height**2) / (width**2 + height**2)) - ) + dfov = 2 * math.atan(math.tan(hfov / 2) * math.sqrt((width**2 + height**2) / (width**2 + height**2))) # Calculate the vertical field of view (VFOV) - vfov = 2 * math.atan( - math.tan(dfov / 2) * (height / math.sqrt(width**2 + height**2)) - ) + vfov = 2 * math.atan(math.tan(dfov / 2) * (height / math.sqrt(width**2 + height**2))) return vfov @@ -123,9 +116,7 @@ def within_fov_cone( return points[mask] -def convert_to_global_frame( - agent_pos: np.ndarray, agent_yaw: float, local_pos: np.ndarray -) -> np.ndarray: +def convert_to_global_frame(agent_pos: np.ndarray, agent_yaw: float, local_pos: np.ndarray) -> np.ndarray: """Converts a given position from the agent's local frame to the global frame. Args: @@ -189,9 +180,7 @@ def xyz_yaw_to_tf_matrix(xyz: np.ndarray, yaw: float) -> np.ndarray: return transformation_matrix -def closest_point_within_threshold( - points_array: np.ndarray, target_point: np.ndarray, threshold: float -) -> int: +def closest_point_within_threshold(points_array: np.ndarray, target_point: np.ndarray, threshold: float) -> int: """Find the point within the threshold distance that is closest to the target_point. Args: @@ -203,10 +192,7 @@ def closest_point_within_threshold( Returns: int: The index of the closest point within the threshold distance. 
""" - distances = np.sqrt( - (points_array[:, 0] - target_point[0]) ** 2 - + (points_array[:, 1] - target_point[1]) ** 2 - ) + distances = np.sqrt((points_array[:, 0] - target_point[0]) ** 2 + (points_array[:, 1] - target_point[1]) ** 2) within_threshold = distances <= threshold if np.any(within_threshold): @@ -216,9 +202,7 @@ def closest_point_within_threshold( return -1 -def transform_points( - transformation_matrix: np.ndarray, points: np.ndarray -) -> np.ndarray: +def transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray: # Add a homogeneous coordinate of 1 to each point for matrix multiplication homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1)))) @@ -229,9 +213,7 @@ def transform_points( return transformed_points[:, :3] / transformed_points[:, 3:] -def get_point_cloud( - depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float -) -> np.ndarray: +def get_point_cloud(depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float) -> np.ndarray: """Calculates the 3D coordinates (x, y, z) of points in the depth image based on the horizontal field of view (HFOV), the image width and height, the depth values, and the pixel x and y coordinates. diff --git a/vlfm/utils/habitat_visualizer.py b/vlfm/utils/habitat_visualizer.py index 7f622cf..1627674 100644 --- a/vlfm/utils/habitat_visualizer.py +++ b/vlfm/utils/habitat_visualizer.py @@ -67,9 +67,7 @@ def collect_data( # Visualize target point cloud on the map color_point_cloud_on_map(infos, policy_info) - map = maps.colorize_draw_agent_and_fit_to_height( - infos[0]["top_down_map"], self.depth[0].shape[0] - ) + map = maps.colorize_draw_agent_and_fit_to_height(infos[0]["top_down_map"], self.depth[0].shape[0]) self.maps.append(map) vis_map_imgs = [ self._reorient_rescale_habitat_map(infos, policy_info[0][vkey]) @@ -121,9 +119,7 @@ def flush_frames(self, failure_cause: str) -> List[np.ndarray]: return frames @staticmethod - def _reorient_rescale_habitat_map( - infos: List[Dict[str, Any]], vis_map: np.ndarray - ) -> np.ndarray: + def _reorient_rescale_habitat_map(infos: List[Dict[str, Any]], vis_map: np.ndarray) -> np.ndarray: # Rotate the cost map to match the agent's orientation at the start # of the episode start_yaw = infos[0]["start_yaw"] @@ -228,9 +224,7 @@ def sim_xy_to_grid_xy( return grid_xy -def color_point_cloud_on_map( - infos: List[Dict[str, Any]], policy_info: List[Dict[str, Any]] -) -> None: +def color_point_cloud_on_map(infos: List[Dict[str, Any]], policy_info: List[Dict[str, Any]]) -> None: if len(policy_info[0]["target_point_cloud"]) == 0: return @@ -240,9 +234,7 @@ def color_point_cloud_on_map( tf_episodic_to_global = infos[0]["top_down_map"]["tf_episodic_to_global"] cloud_episodic_frame = policy_info[0]["target_point_cloud"][:, :3] - cloud_global_frame_xyz = transform_points( - tf_episodic_to_global, cloud_episodic_frame - ) + cloud_global_frame_xyz = transform_points(tf_episodic_to_global, cloud_episodic_frame) cloud_global_frame_habitat = xyz_to_habitat(cloud_global_frame_xyz) cloud_global_frame_habitat_xy = cloud_global_frame_habitat[:, [2, 0]] diff --git a/vlfm/utils/img_utils.py b/vlfm/utils/img_utils.py index fe5cb0c..fa950d3 100644 --- a/vlfm/utils/img_utils.py +++ b/vlfm/utils/img_utils.py @@ -23,16 +23,12 @@ def rotate_image( height, width = image.shape[0], image.shape[1] center = (width // 2, height // 2) rotation_matrix = cv2.getRotationMatrix2D(center, np.degrees(radians), 1.0) - rotated_image = cv2.warpAffine( - image, rotation_matrix, (width, height), 
borderValue=border_value - ) + rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height), borderValue=border_value) return rotated_image -def place_img_in_img( - img1: np.ndarray, img2: np.ndarray, row: int, col: int -) -> np.ndarray: +def place_img_in_img(img1: np.ndarray, img2: np.ndarray, row: int, col: int) -> np.ndarray: """Place img2 in img1 such that img2's center is at the specified coordinates (xy) in img1. @@ -44,9 +40,7 @@ def place_img_in_img( Returns: numpy.ndarray: The updated base image with img2 placed. """ - assert ( - 0 <= row < img1.shape[0] and 0 <= col < img1.shape[1] - ), "Pixel location is outside the image." + assert 0 <= row < img1.shape[0] and 0 <= col < img1.shape[1], "Pixel location is outside the image." top = row - img2.shape[0] // 2 left = col - img2.shape[1] // 2 bottom = top + img2.shape[0] @@ -62,9 +56,7 @@ def place_img_in_img( img2_bottom = img2_top + (img1_bottom - img1_top) img2_right = img2_left + (img1_right - img1_left) - img1[img1_top:img1_bottom, img1_left:img1_right] = img2[ - img2_top:img2_bottom, img2_left:img2_right - ] + img1[img1_top:img1_bottom, img1_left:img1_right] = img2[img2_top:img2_bottom, img2_left:img2_right] return img1 @@ -88,16 +80,12 @@ def monochannel_to_inferno_rgb(image: np.ndarray) -> np.ndarray: normalized_image = (image - min_val) / peak_to_peak # Apply the Inferno colormap - inferno_colormap = cv2.applyColorMap( - (normalized_image * 255).astype(np.uint8), cv2.COLORMAP_INFERNO - ) + inferno_colormap = cv2.applyColorMap((normalized_image * 255).astype(np.uint8), cv2.COLORMAP_INFERNO) return inferno_colormap -def resize_images( - images: List[np.ndarray], match_dimension: str = "height", use_max: bool = True -) -> List[np.ndarray]: +def resize_images(images: List[np.ndarray], match_dimension: str = "height", use_max: bool = True) -> List[np.ndarray]: """ Resize images to match either their heights or their widths. @@ -118,18 +106,14 @@ def resize_images( else: new_height = min(img.shape[0] for img in images) resized_images = [ - cv2.resize(img, (int(img.shape[1] * new_height / img.shape[0]), new_height)) - for img in images + cv2.resize(img, (int(img.shape[1] * new_height / img.shape[0]), new_height)) for img in images ] elif match_dimension == "width": if use_max: new_width = max(img.shape[1] for img in images) else: new_width = min(img.shape[1] for img in images) - resized_images = [ - cv2.resize(img, (new_width, int(img.shape[0] * new_width / img.shape[1]))) - for img in images - ] + resized_images = [cv2.resize(img, (new_width, int(img.shape[0] * new_width / img.shape[1]))) for img in images] else: raise ValueError("Invalid 'match_dimension' argument. Use 'height' or 'width'.") @@ -185,9 +169,7 @@ def pad_to_square( height, width, _ = img.shape larger_side = max(height, width) square_size = larger_side + extra_pad - padded_img = np.ones((square_size, square_size, 3), dtype=np.uint8) * np.array( - padding_color, dtype=np.uint8 - ) + padded_img = np.ones((square_size, square_size, 3), dtype=np.uint8) * np.array(padding_color, dtype=np.uint8) padded_img = place_img_in_img(padded_img, img, square_size // 2, square_size // 2) return padded_img @@ -251,8 +233,7 @@ def pixel_value_within_radius( """ # Ensure that the pixel location is within the image assert ( - 0 <= pixel_location[0] < image.shape[0] - and 0 <= pixel_location[1] < image.shape[1] + 0 <= pixel_location[0] < image.shape[0] and 0 <= pixel_location[1] < image.shape[1] ), "Pixel location is outside the image." 
top_left_x = max(0, pixel_location[0] - radius) @@ -285,9 +266,7 @@ def pixel_value_within_radius( raise ValueError(f"Invalid reduction method: {reduction}") -def median_blur_normalized_depth_image( - depth_image: np.ndarray, ksize: int -) -> np.ndarray: +def median_blur_normalized_depth_image(depth_image: np.ndarray, ksize: int) -> np.ndarray: """Applies a median blur to a normalized depth image. This function first converts the normalized depth image to a uint8 image, @@ -337,9 +316,7 @@ def reorient_rescale_map(vis_map_img: np.ndarray) -> np.ndarray: # Pad the shorter dimension to be the same size as the longer vis_map_img = pad_to_square(vis_map_img, extra_pad=50) # Pad the image border with some white space - vis_map_img = cv2.copyMakeBorder( - vis_map_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=(255, 255, 255) - ) + vis_map_img = cv2.copyMakeBorder(vis_map_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=(255, 255, 255)) return vis_map_img diff --git a/vlfm/utils/log_saver.py b/vlfm/utils/log_saver.py index c1d200a..450f446 100644 --- a/vlfm/utils/log_saver.py +++ b/vlfm/utils/log_saver.py @@ -19,9 +19,7 @@ def log_episode(episode_id: Union[str, int], scene_id: str, data: Dict) -> None: if not (os.path.exists(filename) and os.path.getsize(filename) > 0): print(f"Logging episode {int(episode_id):04d} to {filename}") with open(filename, "w") as f: - json.dump( - {"episode_id": episode_id, "scene_id": scene_id, **data}, f, indent=4 - ) + json.dump({"episode_id": episode_id, "scene_id": scene_id, **data}, f, indent=4) def is_evaluated(episode_id: Union[str, int], scene_id: str) -> bool: diff --git a/vlfm/utils/visualization.py b/vlfm/utils/visualization.py index ac4577c..cfaa749 100644 --- a/vlfm/utils/visualization.py +++ b/vlfm/utils/visualization.py @@ -95,9 +95,7 @@ def generate_text_image(width: int, text: str) -> np.ndarray: return image -def pad_images( - images: List[np.ndarray], pad_from_top: bool = False -) -> List[np.ndarray]: +def pad_images(images: List[np.ndarray], pad_from_top: bool = False) -> List[np.ndarray]: """ Pads a list of images with white pixels to make them have the same dimensions. 
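Usage sketch (not part of the patch): the log_saver.py hunk above keeps one JSON file per evaluated episode, so an eval loop can consult it to avoid re-running finished episodes. Only the is_evaluated and log_episode signatures come from the diff; wrapping them in a small helper as shown here is an illustrative assumption, not code from this commit.

    from typing import Dict, Union

    from vlfm.utils.log_saver import is_evaluated, log_episode


    def maybe_log_episode(episode_id: Union[str, int], scene_id: str, stats: Dict) -> None:
        # Skip episodes that already have a non-empty JSON log on disk.
        if is_evaluated(episode_id, scene_id):
            return
        # Writes {"episode_id": ..., "scene_id": ..., **stats} as indented JSON,
        # matching the json.dump call shown in the log_saver.py hunk above.
        log_episode(episode_id, scene_id, stats)
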
diff --git a/vlfm/utils/vlfm_trainer.py b/vlfm/utils/vlfm_trainer.py index db344e2..a002b2f 100644 --- a/vlfm/utils/vlfm_trainer.py +++ b/vlfm/utils/vlfm_trainer.py @@ -99,9 +99,7 @@ def _eval_checkpoint( self._init_envs(config, is_eval=True) self._agent = self._create_agent(None) - action_shape, discrete_actions = get_action_space_info( - self._agent.policy_action_space - ) + action_shape, discrete_actions = get_action_space_info(self._agent.policy_action_space) if self._agent.actor_critic.should_load_agent_state: self._agent.load_state_dict(ckpt_dict) @@ -131,14 +129,10 @@ def _eval_checkpoint( device=self.device, dtype=torch.bool, ) - stats_episodes: Dict[Any, Any] = ( - {} - ) # dict of dicts that stores stats per episode + stats_episodes: Dict[Any, Any] = {} # dict of dicts that stores stats per episode ep_eval_count: Dict[Any, int] = defaultdict(lambda: 0) - rgb_frames: List[List[np.ndarray]] = [ - [] for _ in range(self.config.habitat_baselines.num_environments) - ] + rgb_frames: List[List[np.ndarray]] = [[] for _ in range(self.config.habitat_baselines.num_environments)] if len(self.config.habitat_baselines.eval.video_option) > 0: os.makedirs(self.config.habitat_baselines.video_dir, exist_ok=True) @@ -151,16 +145,13 @@ def _eval_checkpoint( # if total_num_eps is negative, it means the number of evaluation episodes is unknown if total_num_eps < number_of_eval_episodes and total_num_eps > 1: logger.warn( - f"Config specified {number_of_eval_episodes} eval episodes" - ", dataset only has {total_num_eps}." + f"Config specified {number_of_eval_episodes} eval episodes, dataset only has {{total_num_eps}}." ) logger.warn(f"Evaluating with {total_num_eps} instead.") number_of_eval_episodes = total_num_eps else: assert evals_per_ep == 1 - assert ( - number_of_eval_episodes > 0 - ), "You must specify a number of evaluation episodes with test_episode_count" + assert number_of_eval_episodes > 0, "You must specify a number of evaluation episodes with test_episode_count" pbar = tqdm.tqdm(total=number_of_eval_episodes * evals_per_ep) self._agent.eval() @@ -170,10 +161,7 @@ def _eval_checkpoint( num_successes = 0 num_total = 0 hab_vis = HabitatVis() - while ( - len(stats_episodes) < (number_of_eval_episodes * evals_per_ep) - and self.envs.num_envs > 0 - ): + while len(stats_episodes) < (number_of_eval_episodes * evals_per_ep) and self.envs.num_envs > 0: current_episodes_info = self.envs.current_episodes() with inference_mode(): @@ -202,9 +190,7 @@ def _eval_checkpoint( else: for i, should_insert in enumerate(action_data.should_inserts): if should_insert.item(): - test_recurrent_hidden_states[i] = ( - action_data.rnn_hidden_states[i] - ) + test_recurrent_hidden_states[i] = action_data.rnn_hidden_states[i] prev_actions[i].copy_(action_data.actions[i]) # type: ignore # NB: Move actions to CPU. 
If CUDA tensors are # sent in to env.step(), that will create CUDA contexts @@ -240,9 +226,7 @@ def _eval_checkpoint( device="cpu", ) - rewards = torch.tensor( - rewards_l, dtype=torch.float, device="cpu" - ).unsqueeze(1) + rewards = torch.tensor(rewards_l, dtype=torch.float, device="cpu").unsqueeze(1) current_episode_reward += rewards next_episodes_info = self.envs.current_episodes() envs_to_pause = [] @@ -281,10 +265,7 @@ def _eval_checkpoint( if episode_stats["success"] == 1: num_successes += 1 num_total += 1 - print( - f"Success rate: {num_successes / num_total * 100:.2f}% " - f"({num_successes} out of {num_total})" - ) + print(f"Success rate: {num_successes / num_total * 100:.2f}% ({num_successes} out of {num_total})") from vlfm.utils.episode_stats_logger import ( log_episode_stats, @@ -358,9 +339,7 @@ def _eval_checkpoint( aggregated_stats = {} for stat_key in next(iter(stats_episodes.values())).keys(): - aggregated_stats[stat_key] = np.mean( - [v[stat_key] for v in stats_episodes.values()] - ) + aggregated_stats[stat_key] = np.mean([v[stat_key] for v in stats_episodes.values()]) for k, v in aggregated_stats.items(): logger.info(f"Average episode {k}: {v:.4f}") @@ -369,9 +348,7 @@ def _eval_checkpoint( if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]: step_id = ckpt_dict["extra_state"]["step"] - writer.add_scalar( - "eval_reward/average_reward", aggregated_stats["reward"], step_id - ) + writer.add_scalar("eval_reward/average_reward", aggregated_stats["reward"], step_id) metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"} for k, v in metrics.items(): diff --git a/vlfm/vlm/blip2.py b/vlfm/vlm/blip2.py index 204ea07..85803ba 100644 --- a/vlfm/vlm/blip2.py +++ b/vlfm/vlm/blip2.py @@ -46,15 +46,11 @@ def ask(self, image: np.ndarray, prompt: Optional[str] = None) -> str: """ pil_img = Image.fromarray(image) with torch.inference_mode(): - processed_image = ( - self.vis_processors["eval"](pil_img).unsqueeze(0).to(self.device) - ) + processed_image = self.vis_processors["eval"](pil_img).unsqueeze(0).to(self.device) if prompt is None or prompt == "": out = self.model.generate({"image": processed_image})[0] else: - out = self.model.generate({"image": processed_image, "prompt": prompt})[ - 0 - ] + out = self.model.generate({"image": processed_image, "prompt": prompt})[0] return out diff --git a/vlfm/vlm/blip2itm.py b/vlfm/vlm/blip2itm.py index 4715759..6cb1e7d 100644 --- a/vlfm/vlm/blip2itm.py +++ b/vlfm/vlm/blip2itm.py @@ -26,13 +26,11 @@ def __init__( if device is None: device = torch.device("cuda") if torch.cuda.is_available() else "cpu" - self.model, self.vis_processors, self.text_processors = ( - load_model_and_preprocess( - name=name, - model_type=model_type, - is_eval=True, - device=device, - ) + self.model, self.vis_processors, self.text_processors = load_model_and_preprocess( + name=name, + model_type=model_type, + is_eval=True, + device=device, ) self.device = device @@ -51,9 +49,7 @@ def cosine(self, image: np.ndarray, txt: str) -> float: img = self.vis_processors["eval"](pil_img).unsqueeze(0).to(self.device) txt = self.text_processors["eval"](txt) with torch.inference_mode(): - cosine = self.model( - {"image": img, "text_input": txt}, match_head="itc" - ).item() + cosine = self.model({"image": img, "text_input": txt}, match_head="itc").item() return cosine diff --git a/vlfm/vlm/detections.py b/vlfm/vlm/detections.py index 3ec1917..20c823a 100644 --- a/vlfm/vlm/detections.py +++ b/vlfm/vlm/detections.py @@ -76,9 +76,7 @@ def filter_by_class(self, 
classes: List[str]) -> None: Args: classes (List[str]): List of classes to keep. """ - keep: torch.Tensor = torch.tensor( - [p in classes for p in self.phrases], dtype=torch.bool - ) + keep: torch.Tensor = torch.tensor([p in classes for p in self.phrases], dtype=torch.bool) self._filter(keep) def _filter(self, keep: torch.Tensor) -> None: @@ -162,9 +160,7 @@ def annotate( # If the box appears to be in normalized coordinates, de-normalize using the # image dimensions if box.max() <= 1: - box = box * np.array( - [img.shape[1], img.shape[0], img.shape[1], img.shape[0]] - ) + box = box * np.array([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) box = box.astype(int) # Draw bounding box @@ -220,15 +216,15 @@ def draw_bounding_box( # reshape to a single dimensional array rand_color = single_pixel.reshape(3) - color = [int(c) for c in rand_color] # type: ignore + bgr_color = [int(c) for c in rand_color] # type: ignore else: # Convert RGB to BGR - bgr_color = color[::-1] - color = [int(c) for c in bgr_color] # type: ignore + color = color[::-1] + bgr_color = [int(c) for c in color] # Draw bounding box on image box_thickness = 2 - cv2.rectangle(img, point1, point2, color, thickness=box_thickness) + cv2.rectangle(img, point1, point2, bgr_color, thickness=box_thickness) # Draw class name and score on image text_label = f"{class_name}: {int(score * 100)}%" @@ -242,7 +238,7 @@ def draw_bounding_box( img, (text_x, text_y - 2 * text_size[1]), (text_x + text_size[0], text_y - text_size[1]), - color, + bgr_color, -1, ) cv2.putText( diff --git a/vlfm/vlm/fiber.py b/vlfm/vlm/fiber.py index e4bc35c..3b17451 100644 --- a/vlfm/vlm/fiber.py +++ b/vlfm/vlm/fiber.py @@ -14,9 +14,7 @@ class FIBER: - def __init__( - self, config_file: str = DEFAULT_CONFIG, weights: str = DEFAULT_WEIGHTS - ): + def __init__(self, config_file: str = DEFAULT_CONFIG, weights: str = DEFAULT_WEIGHTS): cfg.merge_from_file(config_file) cfg.num_gpus = 1 cfg.SOLVER.IMS_PER_BATCH = 1 @@ -32,9 +30,7 @@ def __init__( self.fiber = GLIPDemo(cfg, confidence_threshold=0.2) - def detect( - self, image: np.ndarray, phrase: str, visualize: bool = False - ) -> ObjectDetections: + def detect(self, image: np.ndarray, phrase: str, visualize: bool = False) -> ObjectDetections: """ Given an image and a phrase, this function detects the presence of the most suitable object described by the phrase in the image. 
The output object's @@ -57,9 +53,7 @@ def detect( """ result = self.fiber.inference(image, phrase) # Normalize result.bbox to be between 0 and 1 - normalized_bbox = result.bbox / torch.tensor( - [image.shape[1], image.shape[0], image.shape[1], image.shape[0]] - ) + normalized_bbox = result.bbox / torch.tensor([image.shape[1], image.shape[0], image.shape[1], image.shape[0]]) dets = ObjectDetections( image_source=image, @@ -76,9 +70,7 @@ class FIBERClient: def __init__(self, url: str = "http://localhost:9080/fiber"): self.url = url - def detect( - self, image: np.ndarray, phrase: str, visualize: bool = False - ) -> ObjectDetections: + def detect(self, image: np.ndarray, phrase: str, visualize: bool = False) -> ObjectDetections: response = send_request(self.url, image=image, phrase=phrase)["response"] detections = ObjectDetections.from_json(response, image_source=image) diff --git a/vlfm/vlm/grounding_dino.py b/vlfm/vlm/grounding_dino.py index cd29b8f..69aadcb 100644 --- a/vlfm/vlm/grounding_dino.py +++ b/vlfm/vlm/grounding_dino.py @@ -13,9 +13,7 @@ try: from groundingdino.util.inference import load_model, predict except ModuleNotFoundError: - print( - "Could not import groundingdino. This is OK if you are only using the client." - ) + print("Could not import groundingdino. This is OK if you are only using the client.") GROUNDING_DINO_CONFIG = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py" GROUNDING_DINO_WEIGHTS = "data/groundingdino_swint_ogc.pth" @@ -32,16 +30,12 @@ def __init__( text_threshold: float = 0.25, device: torch.device = torch.device("cuda"), ): - self.model = load_model( - model_config_path=config_path, model_checkpoint_path=weights_path - ).to(device) + self.model = load_model(model_config_path=config_path, model_checkpoint_path=weights_path).to(device) self.caption = caption self.box_threshold = box_threshold self.text_threshold = text_threshold - def predict( - self, image: np.ndarray, caption: Optional[str] = None - ) -> ObjectDetections: + def predict(self, image: np.ndarray, caption: Optional[str] = None) -> ObjectDetections: """ This function makes predictions on an input image tensor or numpy array using a pretrained model. 
@@ -57,9 +51,7 @@ def predict( """ # Convert image to tensor and normalize from 0-255 to 0-1 image_tensor = F.to_tensor(image) - image_transformed = F.normalize( - image_tensor, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ) + image_transformed = F.normalize(image_tensor, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) if caption is None: caption_to_use = self.caption else: @@ -86,9 +78,7 @@ class GroundingDINOClient: def __init__(self, port: int = 12181): self.url = f"http://localhost:{port}/gdino" - def predict( - self, image_numpy: np.ndarray, caption: Optional[str] = "" - ) -> ObjectDetections: + def predict(self, image_numpy: np.ndarray, caption: Optional[str] = "") -> ObjectDetections: response = send_request(self.url, image=image_numpy, caption=caption) detections = ObjectDetections.from_json(response, image_source=image_numpy) diff --git a/vlfm/vlm/sam.py b/vlfm/vlm/sam.py index 548b6bd..fd0d7a4 100644 --- a/vlfm/vlm/sam.py +++ b/vlfm/vlm/sam.py @@ -52,9 +52,7 @@ def segment_bbox(self, image: np.ndarray, bbox: List[int]) -> np.ndarray: """ with torch.inference_mode(): self.predictor.set_image(image) - masks, _, _ = self.predictor.predict( - box=np.array(bbox), multimask_output=False - ) + masks, _, _ = self.predictor.predict(box=np.array(bbox), multimask_output=False) return masks[0] @@ -87,9 +85,7 @@ def process_payload(self, payload: dict) -> dict: cropped_mask_str = bool_arr_to_str(cropped_mask) return {"cropped_mask": cropped_mask_str} - mobile_sam = MobileSAMServer( - sam_checkpoint=os.environ.get("MOBILE_SAM_CHECKPOINT", "data/mobile_sam.pt") - ) + mobile_sam = MobileSAMServer(sam_checkpoint=os.environ.get("MOBILE_SAM_CHECKPOINT", "data/mobile_sam.pt")) print("Model loaded!") print(f"Hosting on port {args.port}...") host_model(mobile_sam, name="mobile_sam", port=args.port) diff --git a/vlfm/vlm/yolov7.py b/vlfm/vlm/yolov7.py index 0ec8c4f..809e00f 100644 --- a/vlfm/vlm/yolov7.py +++ b/vlfm/vlm/yolov7.py @@ -28,13 +28,9 @@ class YOLOv7: - def __init__( - self, weights: str, image_size: int = 640, half_precision: bool = True - ): + def __init__(self, weights: str, image_size: int = 640, half_precision: bool = True): """Loads the model and saves it to a field.""" - self.device = ( - torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - ) + self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") self.half_precision = self.device.type != "cpu" and half_precision self.model = attempt_load(weights, map_location=self.device) # load FP32 model stride = int(self.model.stride.max()) # model stride @@ -45,9 +41,7 @@ def __init__( # Warm-up if self.device.type != "cpu": - dummy_img = torch.rand( - 1, 3, int(self.image_size * 0.7), self.image_size - ).to(self.device) + dummy_img = torch.rand(1, 3, int(self.image_size * 0.7), self.image_size).to(self.device) if self.half_precision: dummy_img = dummy_img.half() for i in range(3): @@ -111,9 +105,7 @@ def predict( logits = pred[:, 4] phrases = [COCO_CLASSES[int(i)] for i in pred[:, 5]] - detections = ObjectDetections( - boxes, logits, phrases, image_source=image, fmt="xyxy" - ) + detections = ObjectDetections(boxes, logits, phrases, image_source=image, fmt="xyxy") return detections
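
Usage sketch (not part of the patch): the VLM client wrappers reformatted above keep their simple interfaces, so a minimal smoke test only needs a running detector server. Assumptions here: a GroundingDINO server is already hosted on the default port (12181, per the hunk above), and ObjectDetections exposes boxes/logits/phrases fields matching its constructor arguments; the image path and caption are placeholders.

    import cv2

    from vlfm.vlm.grounding_dino import GroundingDINOClient


    def print_detections(image_path: str, caption: str) -> None:
        bgr = cv2.imread(image_path)
        assert bgr is not None, f"Could not read {image_path}"
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        # Posts the image to http://localhost:12181/gdino and parses the JSON reply
        # into an ObjectDetections instance.
        client = GroundingDINOClient()
        detections = client.predict(rgb, caption=caption)
        for box, phrase in zip(detections.boxes, detections.phrases):
            # Boxes may be normalized to [0, 1]; detections.py's annotate() shows the
            # [width, height, width, height] de-normalization if pixel coords are needed.
            print(f"{phrase}: {box}")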