From cf4033494dd7d3e883acd3f118971f8373af5bf1 Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Fri, 13 Sep 2024 16:17:53 -0400 Subject: [PATCH 1/5] Progress towards object detection result. --- predicators/envs/spot_env.py | 116 +++++++++++++++++- predicators/perception/spot_perceiver.py | 9 +- .../spot_utils/perception/object_detection.py | 109 +++++++++++++++- .../spot_utils/perception/spot_cameras.py | 12 +- 4 files changed, 230 insertions(+), 16 deletions(-) diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py index affadd59a9..d0b5f5f3b9 100644 --- a/predicators/envs/spot_env.py +++ b/predicators/envs/spot_env.py @@ -25,11 +25,11 @@ from predicators.spot_utils.perception.object_detection import \ AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ LanguageObjectDetectionID, ObjectDetectionID, detect_objects, \ - visualize_all_artifacts + visualize_all_artifacts, _query_detic_sam2 from predicators.spot_utils.perception.object_specific_grasp_selection import \ brush_prompt, bucket_prompt, football_prompt, train_toy_prompt from predicators.spot_utils.perception.perception_structs import \ - RGBDImageWithContext + RGBDImageWithContext, RGBDImage, SegmentedBoundingBox from predicators.spot_utils.perception.spot_cameras import capture_images, capture_images_without_context from predicators.spot_utils.skills.spot_find_objects import \ init_search_for_objects @@ -2468,8 +2468,16 @@ def _get_dry_task(self, train_or_test: str, @property def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]: - """Get an object from a perception detection ID.""" - raise NotImplementedError("No dry task for VLMTestEnv.") + + detection_id_to_obj: Dict[ObjectDetectionID, Object] = {} + objects = { + Object("pan", _movable_object_type), + Object("cup", _movable_object_type) + } + for o in objects: + detection_id = LanguageObjectDetectionID(o.name) + detection_id_to_obj[detection_id] = o + return detection_id_to_obj def _create_operators(self) -> Iterator[STRIPSOperator]: # Pick object @@ -2553,8 +2561,106 @@ def __init__(self, use_gui: bool = True) -> None: def _actively_construct_env_task(self) -> EnvironmentTask: assert self._robot is not None rgbd_images = capture_images_without_context(self._robot) + # import PIL + # imgs = [v.rgb for _, v in rgbd_images.items()] + # rot_imgs = [v.rotated_rgb for _, v in rgbd_images.items()] + # ex1 = PIL.Image.fromarray(imgs[0]) + # ex2 = PIL.Image.fromarray(rot_imgs[0]) + # import pdb; pdb.set_trace() gripper_open_percentage = get_robot_gripper_open_percentage(self._robot) objects_in_view = [] + + # Perform object detection. + object_ids = self._detection_id_to_obj.keys() + ret = _query_detic_sam2(object_ids, rgbd_images) + artifacts = {"language": {"rgbds": rgbd_images, "object_id_to_img_detections": ret}} + detections_outfile = Path(".") / "object_detection_artifacts.png" + no_detections_outfile = Path(".") / "no_detection_artifacts.png" + visualize_all_artifacts(artifacts, detections_outfile, no_detections_outfile) + + # Draw object bounding box on images. + rgbds = artifacts["language"]["rgbds"] + detections = artifacts["language"]["object_id_to_img_detections"] + flat_detections: List[Tuple[RGBDImage, + LanguageObjectDetectionID, + SegmentedBoundingBox]] = [] + for obj_id, img_detections in detections.items(): + for camera, seg_bb in img_detections.items(): + rgbd = rgbds[camera] + flat_detections.append((rgbd, obj_id, seg_bb)) + + # For now assume we only have 1 image, front-left. 
+ import pdb; pdb.set_trace() + import PIL + from PIL import ImageDraw, ImageFont + bb_pil_imgs = [] + img = list(rgbd_images.values())[0].rotated_rgb + pil_img = PIL.Image.fromarray(img) + draw = ImageDraw.Draw(pil_img) + for i, (rgbd, obj_id, seg_bb) in enumerate(flat_detections): + # img = rgbd.rotated_rgb + # pil_img = PIL.Image.fromarray(img) + x0, y0, x1, y1 = seg_bb.bounding_box + draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) + text = f"{obj_id.language_id}" + font = ImageFont.load_default() + # font = utils.get_scaled_default_font(draw, 4) + # text_width, text_height = draw.textsize(text, font) + # text_width = draw.textlength(text, font) + # text_height = font.getsize("hg")[1] + text_mask = font.getmask(text) + text_width, text_height = text_mask.size + text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + draw.rectangle(text_bbox, fill='green') + draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + + import pdb; pdb.set_trace() + + + + # box = seg_bb.bounding_box + # x0, y0 = box[0], box[1] + # w, h = box[2] - box[0], box[3] - box[1] + # ax_row[3].add_patch( + # plt.Rectangle((x0, y0), + # w, + # h, + # edgecolor='green', + # facecolor=(0, 0, 0, 0), + # lw=1)) + + # import PIL + # from PIL import ImageDraw + # annotated_pil_imgs = [] + # for img, img_name in zip(imgs, img_names): + # pil_img = PIL.Image.fromarray(img) + # draw = ImageDraw.Draw(pil_img) + # font = utils.get_scaled_default_font(draw, 4) + # annotated_pil_img = utils.add_text_to_draw_img(draw, (0, 0), self.camera_name_to_annotation[img_name], font) + # annotated_pil_imgs.append(pil_img) + # annotated_imgs = [np.array(img) for img in annotated_pil_imgs] + + # im = Image.open(image_path) + # draw = ImageDraw.Draw(im) + # font = ImageFont.load_default() # You can use a specific font if needed + + # for mask in masks: + # # Assuming you have a function to convert the mask to a PIL Image or polygon + # mask_image = convert_mask_to_pil(mask) + # im.paste(mask_image, (0, 0), mask_image) + + # for box, class_name, score in zip(input_boxes, classes, scores): + # x0, y0, x1, y1 = box + # draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) + # text = f"{class_name}: {score:.2f}" + # text_width, text_height = draw.textsize(text, font) + # text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + # draw.rectangle(text_bbox, fill='green') + # draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + + # im.show() # Or save it: im.save("output.jpg") + # import pdb; pdb.set_trace() + obs = _TruncatedSpotObservation( rgbd_images, set(objects_in_view), @@ -2621,8 +2727,8 @@ def step(self, action: Action) -> Observation: f"was encountered. 
Trying again.\n{e}") rgbd_images = capture_images_without_context(self._robot) gripper_open_percentage = get_robot_gripper_open_percentage(self._robot) - print(gripper_open_percentage) objects_in_view = [] + obs = _TruncatedSpotObservation( rgbd_images, set(objects_in_view), diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py index 14c4b5550c..eef7e53c29 100644 --- a/predicators/perception/spot_perceiver.py +++ b/predicators/perception/spot_perceiver.py @@ -24,6 +24,11 @@ GoalDescription, GroundAtom, Object, Observation, Predicate, \ SpotActionExtraInfo, State, Task, Video +from predicators.spot_utils.perception.object_detection import \ + AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ + LanguageObjectDetectionID, ObjectDetectionID, detect_objects, \ + visualize_all_artifacts, _query_detic_sam2 + class SpotPerceiver(BasePerceiver): """A perceiver specific to spot envs.""" @@ -683,7 +688,7 @@ def step(self, observation: Observation) -> State: imgs = observation.rgbd_images img_names = [v.camera_name for _, v in imgs.items()] imgs = [v.rgb for _, v in imgs.items()] - import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() import PIL from PIL import ImageDraw annotated_pil_imgs = [] @@ -694,7 +699,7 @@ def step(self, observation: Observation) -> State: annotated_pil_img = utils.add_text_to_draw_img(draw, (0, 0), self.camera_name_to_annotation[img_name], font) annotated_pil_imgs.append(pil_img) annotated_imgs = [np.array(img) for img in annotated_pil_imgs] - import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() self._gripper_open_percentage = observation.gripper_open_percentage self._curr_state = self._create_state() self._curr_state.simulator_state["images"] = annotated_imgs diff --git a/predicators/spot_utils/perception/object_detection.py b/predicators/spot_utils/perception/object_detection.py index 6757e8f324..94b7581d66 100644 --- a/predicators/spot_utils/perception/object_detection.py +++ b/predicators/spot_utils/perception/object_detection.py @@ -41,7 +41,7 @@ from predicators.spot_utils.perception.perception_structs import \ AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ LanguageObjectDetectionID, ObjectDetectionID, PythonicObjectDetectionID, \ - RGBDImageWithContext, SegmentedBoundingBox + RGBDImageWithContext, RGBDImage, SegmentedBoundingBox from predicators.spot_utils.utils import get_april_tag_transform, \ get_graph_nav_dir from predicators.utils import rotate_point_in_image @@ -351,6 +351,108 @@ def _query_detic_sam( return object_id_to_img_detections +def _query_detic_sam2( + object_ids: Collection[LanguageObjectDetectionID], + rgbds: Dict[str, RGBDImage], + max_server_retries: int = 5, + detection_threshold: float = CFG.spot_vision_detection_threshold +) -> Dict[ObjectDetectionID, Dict[str, SegmentedBoundingBox]]: + """Returns object ID to image ID (camera) to segmented bounding box.""" + + object_id_to_img_detections: Dict[ObjectDetectionID, + Dict[str, SegmentedBoundingBox]] = { + obj_id: {} + for obj_id in object_ids + } + + # Create buffer dictionary to send to server. + buf_dict = {} + for camera_name, rgbd in rgbds.items(): + pil_rotated_img = PIL.Image.fromarray(rgbd.rotated_rgb) # type: ignore + buf_dict[camera_name] = _image_to_bytes(pil_rotated_img) + + # Extract all the classes that we want to detect. + classes = sorted(o.language_id for o in object_ids) + + # Query server, retrying to handle possible wifi issues. 
+ # import pdb; pdb.set_trace() + # imgs = [v.rotated_rgb for _, v in rgbds.items()] + # pil_img = PIL.Image.fromarray(imgs[0]) + # import pdb; pdb.set_trace() + + for _ in range(max_server_retries): + try: + r = requests.post("http://localhost:5550/batch_predict", + files=buf_dict, + data={"classes": ",".join(classes)}) + break + except requests.exceptions.ConnectionError: + continue + else: + logging.warning("DETIC-SAM FAILED, POSSIBLE SERVER/WIFI ISSUE") + return object_id_to_img_detections + + # If the status code is not 200, then fail. + if r.status_code != 200: + logging.warning(f"DETIC-SAM FAILED! STATUS CODE: {r.status_code}") + return object_id_to_img_detections + + # Querying the server succeeded; unpack the contents. + with io.BytesIO(r.content) as f: + try: + server_results = np.load(f, allow_pickle=True) + # Corrupted results. + except pkl.UnpicklingError: + logging.warning("DETIC-SAM FAILED DURING UNPICKLING!") + return object_id_to_img_detections + + # Process the results and save all detections per object ID. + for camera_name, rgbd in rgbds.items(): + rot_boxes = server_results[f"{camera_name}_boxes"] + ret_classes = server_results[f"{camera_name}_classes"] + rot_masks = server_results[f"{camera_name}_masks"] + scores = server_results[f"{camera_name}_scores"] + + # Invert the rotation immediately so we don't need to worry about + # them henceforth. + # h, w = rgbd.rgb.shape[:2] + # image_rot = rgbd.image_rot + # boxes = [ + # _rotate_bounding_box(bb, -image_rot, h, w) for bb in rot_boxes + # ] + # masks = [ + # ndimage.rotate(m.squeeze(), -image_rot, reshape=False) + # for m in rot_masks + # ] + boxes = rot_boxes + masks = rot_masks + + # Filter out detections by confidence. We threshold detections + # at a set confidence level minimum, and if there are multiple, + # we only select the most confident one. This structure makes + # it easy for us to select multiple detections if that's ever + # necessary in the future. + for obj_id in object_ids: + # If there were no detections (which means all the + # returned values will be numpy arrays of shape (0, 0)) + # then just skip this source. + if ret_classes.size == 0: + continue + obj_id_mask = (ret_classes == obj_id.language_id) + if not np.any(obj_id_mask): + continue + max_score = np.max(scores[obj_id_mask]) + best_idx = np.where(scores == max_score)[0].item() + if scores[best_idx] < detection_threshold: + continue + # Save the detection. + seg_bb = SegmentedBoundingBox(boxes[best_idx], masks[best_idx], + scores[best_idx]) + object_id_to_img_detections[obj_id][rgbd.camera_name] = seg_bb + + import pdb; pdb.set_trace() + return object_id_to_img_detections + def _image_to_bytes(img: PIL.Image.Image) -> io.BytesIO: """Helper function to convert from a PIL image into a bytes object.""" @@ -522,7 +624,8 @@ def visualize_all_artifacts(artifacts: Dict[str, ax_row[2].imshow(rgbd.depth, cmap='Greys_r', vmin=0, vmax=10000) # Bounding box. - ax_row[3].imshow(rgbd.rgb) + # ax_row[3].imshow(rgbd.rgb) + ax_row[3].imshow(rgbd.rotated_rgb) box = seg_bb.bounding_box x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] @@ -534,7 +637,7 @@ def visualize_all_artifacts(artifacts: Dict[str, facecolor=(0, 0, 0, 0), lw=1)) - ax_row[4].imshow(seg_bb.mask, cmap="binary_r", vmin=0, vmax=1) + # ax_row[4].imshow(seg_bb.mask, cmap="binary_r", vmin=0, vmax=1) # Labels. 
abbreviated_name = obj_id.language_id diff --git a/predicators/spot_utils/perception/spot_cameras.py b/predicators/spot_utils/perception/spot_cameras.py index 0c249a89f4..299fd8db0f 100644 --- a/predicators/spot_utils/perception/spot_cameras.py +++ b/predicators/spot_utils/perception/spot_cameras.py @@ -23,12 +23,12 @@ 'right_fisheye_image': 180 } RGB_TO_DEPTH_CAMERAS = { - "hand_color_image": "hand_depth_in_hand_color_frame", - "left_fisheye_image": "left_depth_in_visual_frame", - "right_fisheye_image": "right_depth_in_visual_frame", + # "hand_color_image": "hand_depth_in_hand_color_frame", + # "left_fisheye_image": "left_depth_in_visual_frame", + # "right_fisheye_image": "right_depth_in_visual_frame", "frontleft_fisheye_image": "frontleft_depth_in_visual_frame", - "frontright_fisheye_image": "frontright_depth_in_visual_frame", - "back_fisheye_image": "back_depth_in_visual_frame" + # "frontright_fisheye_image": "frontright_depth_in_visual_frame", + # "back_fisheye_image": "back_depth_in_visual_frame" } # Hack to avoid double image capturing when we want to (1) get object states @@ -174,7 +174,7 @@ def capture_images_without_context( rgb_img_resp = name_to_response[camera_name] depth_img_resp = name_to_response[RGB_TO_DEPTH_CAMERAS[camera_name]] rgb_img = _image_response_to_image(rgb_img_resp) - rgb_img = ndimage.rotate(rgb_img, ROTATION_ANGLE[camera_name]) + # rgb_img = ndimage.rotate(rgb_img, ROTATION_ANGLE[camera_name], reshape=False) depth_img = _image_response_to_image(depth_img_resp) # # Create transform. # camera_tform_body = get_a_tform_b( From 94dff09968ad3fc417ea0ae6ef555887d22b59d7 Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Sat, 14 Sep 2024 18:16:16 -0400 Subject: [PATCH 2/5] Get camera annotation + object detection annotation to work on all images. --- predicators/envs/spot_env.py | 158 ++++++++---------- predicators/perception/spot_perceiver.py | 36 +++- .../spot_utils/perception/object_detection.py | 2 +- .../spot_utils/perception/spot_cameras.py | 12 +- 4 files changed, 106 insertions(+), 102 deletions(-) diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py index d0b5f5f3b9..3700100e0c 100644 --- a/predicators/envs/spot_env.py +++ b/predicators/envs/spot_env.py @@ -105,6 +105,8 @@ class _TruncatedSpotObservation: # # A placeholder until all predicates have classifiers # nonpercept_atoms: Set[GroundAtom] # nonpercept_predicates: Set[Predicate] + # Object detections per camera in self.rgbd_images. + object_detections_per_camera: Dict[str, List[Tuple[ObjectDetectionID, SegmentedBoundingBox]]] class _PartialPerceptionState(State): @@ -2472,7 +2474,9 @@ def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]: detection_id_to_obj: Dict[ObjectDetectionID, Object] = {} objects = { Object("pan", _movable_object_type), - Object("cup", _movable_object_type) + Object("cup", _movable_object_type), + Object("chair", _movable_object_type), + Object("bowl", _movable_object_type), } for o in objects: detection_id = LanguageObjectDetectionID(o.name) @@ -2558,6 +2562,18 @@ def __init__(self, use_gui: bool = True) -> None: self._train_tasks = [] self._test_tasks = [] + def detect_objects(self, rgbd_images: Dict[str, RGBDImage]) -> Dict[str, List[Tuple[ObjectDetectionID, SegmentedBoundingBox]]]: + object_ids = self._detection_id_to_obj.keys() + object_id_to_img_detections = _query_detic_sam2(object_ids, rgbd_images) + # This ^ is currently a mapping of object_id -> camera_name -> SegmentedBoundingBox. 
+ # We want to do our annotations by camera image, so let's turn this into a + # mapping of camera_name -> object_id -> SegmentedBoundingBox. + detections = {k: [] for k in rgbd_images.keys()} + for object_id, d in object_id_to_img_detections.items(): + for camera_name, seg_bb in d.items(): + detections[camera_name].append((object_id, seg_bb)) + return detections + def _actively_construct_env_task(self) -> EnvironmentTask: assert self._robot is not None rgbd_images = capture_images_without_context(self._robot) @@ -2571,103 +2587,60 @@ def _actively_construct_env_task(self) -> EnvironmentTask: objects_in_view = [] # Perform object detection. - object_ids = self._detection_id_to_obj.keys() - ret = _query_detic_sam2(object_ids, rgbd_images) - artifacts = {"language": {"rgbds": rgbd_images, "object_id_to_img_detections": ret}} - detections_outfile = Path(".") / "object_detection_artifacts.png" - no_detections_outfile = Path(".") / "no_detection_artifacts.png" - visualize_all_artifacts(artifacts, detections_outfile, no_detections_outfile) - - # Draw object bounding box on images. - rgbds = artifacts["language"]["rgbds"] - detections = artifacts["language"]["object_id_to_img_detections"] - flat_detections: List[Tuple[RGBDImage, - LanguageObjectDetectionID, - SegmentedBoundingBox]] = [] - for obj_id, img_detections in detections.items(): - for camera, seg_bb in img_detections.items(): - rgbd = rgbds[camera] - flat_detections.append((rgbd, obj_id, seg_bb)) + object_detections_per_camera = self.detect_objects(rgbd_images) + + + # artifacts = {"language": {"rgbds": rgbd_images, "object_id_to_img_detections": ret}} + # detections_outfile = Path(".") / "object_detection_artifacts.png" + # no_detections_outfile = Path(".") / "no_detection_artifacts.png" + # visualize_all_artifacts(artifacts, detections_outfile, no_detections_outfile) + + # # Draw object bounding box on images. + # rgbds = artifacts["language"]["rgbds"] + # detections = artifacts["language"]["object_id_to_img_detections"] + # flat_detections: List[Tuple[RGBDImage, + # LanguageObjectDetectionID, + # SegmentedBoundingBox]] = [] + # for obj_id, img_detections in detections.items(): + # for camera, seg_bb in img_detections.items(): + # rgbd = rgbds[camera] + # flat_detections.append((rgbd, obj_id, seg_bb)) - # For now assume we only have 1 image, front-left. 
- import pdb; pdb.set_trace() - import PIL - from PIL import ImageDraw, ImageFont - bb_pil_imgs = [] - img = list(rgbd_images.values())[0].rotated_rgb - pil_img = PIL.Image.fromarray(img) - draw = ImageDraw.Draw(pil_img) - for i, (rgbd, obj_id, seg_bb) in enumerate(flat_detections): - # img = rgbd.rotated_rgb - # pil_img = PIL.Image.fromarray(img) - x0, y0, x1, y1 = seg_bb.bounding_box - draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) - text = f"{obj_id.language_id}" - font = ImageFont.load_default() - # font = utils.get_scaled_default_font(draw, 4) - # text_width, text_height = draw.textsize(text, font) - # text_width = draw.textlength(text, font) - # text_height = font.getsize("hg")[1] - text_mask = font.getmask(text) - text_width, text_height = text_mask.size - text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] - draw.rectangle(text_bbox, fill='green') - draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) - - import pdb; pdb.set_trace() - - - - # box = seg_bb.bounding_box - # x0, y0 = box[0], box[1] - # w, h = box[2] - box[0], box[3] - box[1] - # ax_row[3].add_patch( - # plt.Rectangle((x0, y0), - # w, - # h, - # edgecolor='green', - # facecolor=(0, 0, 0, 0), - # lw=1)) - - # import PIL - # from PIL import ImageDraw - # annotated_pil_imgs = [] - # for img, img_name in zip(imgs, img_names): - # pil_img = PIL.Image.fromarray(img) - # draw = ImageDraw.Draw(pil_img) - # font = utils.get_scaled_default_font(draw, 4) - # annotated_pil_img = utils.add_text_to_draw_img(draw, (0, 0), self.camera_name_to_annotation[img_name], font) - # annotated_pil_imgs.append(pil_img) - # annotated_imgs = [np.array(img) for img in annotated_pil_imgs] - - # im = Image.open(image_path) - # draw = ImageDraw.Draw(im) - # font = ImageFont.load_default() # You can use a specific font if needed - - # for mask in masks: - # # Assuming you have a function to convert the mask to a PIL Image or polygon - # mask_image = convert_mask_to_pil(mask) - # im.paste(mask_image, (0, 0), mask_image) - - # for box, class_name, score in zip(input_boxes, classes, scores): - # x0, y0, x1, y1 = box - # draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) - # text = f"{class_name}: {score:.2f}" - # text_width, text_height = draw.textsize(text, font) - # text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] - # draw.rectangle(text_bbox, fill='green') - # draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) - - # im.show() # Or save it: im.save("output.jpg") - # import pdb; pdb.set_trace() + # # For now assume we only have 1 image, front-left. 
+ # import pdb; pdb.set_trace() + # import PIL + # from PIL import ImageDraw, ImageFont + # bb_pil_imgs = [] + # img = list(rgbd_images.values())[0].rotated_rgb + # pil_img = PIL.Image.fromarray(img) + # draw = ImageDraw.Draw(pil_img) + # for i, (rgbd, obj_id, seg_bb) in enumerate(flat_detections): + # # img = rgbd.rotated_rgb + # # pil_img = PIL.Image.fromarray(img) + # x0, y0, x1, y1 = seg_bb.bounding_box + # draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) + # text = f"{obj_id.language_id}" + # font = ImageFont.load_default() + # # font = utils.get_scaled_default_font(draw, 4) + # # text_width, text_height = draw.textsize(text, font) + # # text_width = draw.textlength(text, font) + # # text_height = font.getsize("hg")[1] + # text_mask = font.getmask(text) + # text_width, text_height = text_mask.size + # text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + # draw.rectangle(text_bbox, fill='green') + # draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + # import pdb; pdb.set_trace() + obs = _TruncatedSpotObservation( rgbd_images, set(objects_in_view), set(), set(), self._spot_object, - gripper_open_percentage + gripper_open_percentage, + object_detections_per_camera ) goal_description = self._generate_goal_description() task = EnvironmentTask(obs, goal_description) @@ -2728,6 +2701,8 @@ def step(self, action: Action) -> Observation: rgbd_images = capture_images_without_context(self._robot) gripper_open_percentage = get_robot_gripper_open_percentage(self._robot) objects_in_view = [] + # Perform object detection. + object_detections_per_camera = self.detect_objects(rgbd_images) obs = _TruncatedSpotObservation( rgbd_images, @@ -2735,7 +2710,8 @@ def step(self, action: Action) -> Observation: set(), set(), self._spot_object, - gripper_open_percentage + gripper_open_percentage, + object_detections_per_camera ) return obs diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py index eef7e53c29..1f86189b71 100644 --- a/predicators/perception/spot_perceiver.py +++ b/predicators/perception/spot_perceiver.py @@ -683,12 +683,40 @@ def reset(self, env_task: EnvironmentTask) -> Task: return Task(state, goal) def step(self, observation: Observation) -> State: + import pdb; pdb.set_trace() self._waiting_for_observation = False self._robot = observation.robot - imgs = observation.rgbd_images - img_names = [v.camera_name for _, v in imgs.items()] - imgs = [v.rgb for _, v in imgs.items()] - # import pdb; pdb.set_trace() + img_objects = observation.rgbd_images # RGBDImage objects + img_names = [v.camera_name for _, v in img_objects.items()] + imgs = [v.rotated_rgb for _, v in img_objects.items()] + import PIL + from PIL import ImageDraw, ImageFont + pil_imgs = [PIL.Image.fromarray(img) for img in imgs] + # Annotate images with detected objects (names + bounding box) + # and camera name. + object_detections_per_camera = observation.object_detections_per_camera + imgs_with_objects_annotated = [] # These are PIL images. + for i, camera_name in enumerate(img_names): + draw = ImageDraw.Draw(pil_imgs[i]) + # Annotate with camera name. + font = utils.get_scaled_default_font(draw, 4) + _ = utils.add_text_to_draw_img(draw, (0, 0), self.camera_name_to_annotation[camera_name], font) + # Annotate with object detections. 
+ detections = object_detections_per_camera[camera_name] + for obj_id, seg_bb in detections: + x0, y0, x1, y1 = seg_bb.bounding_box + draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) + text = f"{obj_id.language_id}" + font = ImageFont.load_default() + text_mask = font.getmask(text) + text_width, text_height = text_mask.size + text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + draw.rectangle(text_bbox, fill='green') + draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + + import pdb; pdb.set_trace() + + import PIL from PIL import ImageDraw annotated_pil_imgs = [] diff --git a/predicators/spot_utils/perception/object_detection.py b/predicators/spot_utils/perception/object_detection.py index 94b7581d66..90365f2b02 100644 --- a/predicators/spot_utils/perception/object_detection.py +++ b/predicators/spot_utils/perception/object_detection.py @@ -450,7 +450,7 @@ def _query_detic_sam2( scores[best_idx]) object_id_to_img_detections[obj_id][rgbd.camera_name] = seg_bb - import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() return object_id_to_img_detections diff --git a/predicators/spot_utils/perception/spot_cameras.py b/predicators/spot_utils/perception/spot_cameras.py index 299fd8db0f..be5131690a 100644 --- a/predicators/spot_utils/perception/spot_cameras.py +++ b/predicators/spot_utils/perception/spot_cameras.py @@ -23,12 +23,12 @@ 'right_fisheye_image': 180 } RGB_TO_DEPTH_CAMERAS = { - # "hand_color_image": "hand_depth_in_hand_color_frame", - # "left_fisheye_image": "left_depth_in_visual_frame", - # "right_fisheye_image": "right_depth_in_visual_frame", + "hand_color_image": "hand_depth_in_hand_color_frame", + "left_fisheye_image": "left_depth_in_visual_frame", + "right_fisheye_image": "right_depth_in_visual_frame", "frontleft_fisheye_image": "frontleft_depth_in_visual_frame", - # "frontright_fisheye_image": "frontright_depth_in_visual_frame", - # "back_fisheye_image": "back_depth_in_visual_frame" + "frontright_fisheye_image": "frontright_depth_in_visual_frame", + "back_fisheye_image": "back_depth_in_visual_frame" } # Hack to avoid double image capturing when we want to (1) get object states @@ -125,7 +125,7 @@ def capture_images_without_context( robot: Robot, camera_names: Optional[Collection[str]] = None, quality_percent: int = 100, -) -> Dict[str, RGBDImageWithContext]: +) -> Dict[str, RGBDImage]: """Build an image request and get the responses. If no camera names are provided, all RGB cameras are used. From aec3f863604361410d0342a934a6e215b7b6b1d7 Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Sat, 14 Sep 2024 21:00:21 -0400 Subject: [PATCH 3/5] Update annotation text font and background. --- predicators/perception/spot_perceiver.py | 7 +++---- predicators/spot_utils/perception/spot_cameras.py | 10 +++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py index 1f86189b71..6b834521c7 100644 --- a/predicators/perception/spot_perceiver.py +++ b/predicators/perception/spot_perceiver.py @@ -695,7 +695,6 @@ def step(self, observation: Observation) -> State: # Annotate images with detected objects (names + bounding box) # and camera name. object_detections_per_camera = observation.object_detections_per_camera - imgs_with_objects_annotated = [] # These are PIL images. for i, camera_name in enumerate(img_names): draw = ImageDraw.Draw(pil_imgs[i]) # Annotate with camera name. 
@@ -707,12 +706,12 @@ def step(self, observation: Observation) -> State: x0, y0, x1, y1 = seg_bb.bounding_box draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) text = f"{obj_id.language_id}" - font = ImageFont.load_default() + font = utils.get_scaled_default_font(draw, 3) text_mask = font.getmask(text) text_width, text_height = text_mask.size - text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + text_bbox = [(x0, y0 - 1.5*text_height), (x0 + text_width + 1, y0)] draw.rectangle(text_bbox, fill='green') - draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + draw.text((x0 + 1, y0 - 1.5*text_height), text, fill='white', font=font) import pdb; pdb.set_trace() diff --git a/predicators/spot_utils/perception/spot_cameras.py b/predicators/spot_utils/perception/spot_cameras.py index be5131690a..15349535b8 100644 --- a/predicators/spot_utils/perception/spot_cameras.py +++ b/predicators/spot_utils/perception/spot_cameras.py @@ -23,12 +23,12 @@ 'right_fisheye_image': 180 } RGB_TO_DEPTH_CAMERAS = { - "hand_color_image": "hand_depth_in_hand_color_frame", - "left_fisheye_image": "left_depth_in_visual_frame", - "right_fisheye_image": "right_depth_in_visual_frame", + # "hand_color_image": "hand_depth_in_hand_color_frame", + # "left_fisheye_image": "left_depth_in_visual_frame", + # "right_fisheye_image": "right_depth_in_visual_frame", "frontleft_fisheye_image": "frontleft_depth_in_visual_frame", - "frontright_fisheye_image": "frontright_depth_in_visual_frame", - "back_fisheye_image": "back_depth_in_visual_frame" + # "frontright_fisheye_image": "frontright_depth_in_visual_frame", + # "back_fisheye_image": "back_depth_in_visual_frame" } # Hack to avoid double image capturing when we want to (1) get object states From ac9fc2f69b6ff01832e6c202feebfd359903f211 Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Sat, 14 Sep 2024 21:07:05 -0400 Subject: [PATCH 4/5] Run autoformatter. --- predicators/approaches/bilevel_planning_approach.py | 1 - predicators/envs/spot_env.py | 11 ++++++----- predicators/ground_truth_models/spot_env/options.py | 8 ++++---- predicators/perception/spot_perceiver.py | 11 ++++------- predicators/spot_utils/perception/object_detection.py | 2 +- predicators/spot_utils/perception/spot_cameras.py | 6 +++--- predicators/spot_utils/spot_localization.py | 1 - 7 files changed, 18 insertions(+), 22 deletions(-) diff --git a/predicators/approaches/bilevel_planning_approach.py b/predicators/approaches/bilevel_planning_approach.py index 3c02e7dbc5..dbf9eb8a33 100644 --- a/predicators/approaches/bilevel_planning_approach.py +++ b/predicators/approaches/bilevel_planning_approach.py @@ -61,7 +61,6 @@ def _solve(self, task: Task, timeout: int) -> Callable[[State], Action]: utils.abstract(task.init, preds, self._vlm) import pdb; pdb.set_trace() # utils.abstract(task.init, preds, self._vlm) - # Run task planning only and then greedily sample and execute in the # policy. 
if self._plan_without_sim: diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py index 3700100e0c..f7a6798a53 100644 --- a/predicators/envs/spot_env.py +++ b/predicators/envs/spot_env.py @@ -24,13 +24,14 @@ from predicators.settings import CFG from predicators.spot_utils.perception.object_detection import \ AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ - LanguageObjectDetectionID, ObjectDetectionID, detect_objects, \ - visualize_all_artifacts, _query_detic_sam2 + LanguageObjectDetectionID, ObjectDetectionID, _query_detic_sam2, \ + detect_objects, visualize_all_artifacts from predicators.spot_utils.perception.object_specific_grasp_selection import \ brush_prompt, bucket_prompt, football_prompt, train_toy_prompt -from predicators.spot_utils.perception.perception_structs import \ - RGBDImageWithContext, RGBDImage, SegmentedBoundingBox -from predicators.spot_utils.perception.spot_cameras import capture_images, capture_images_without_context +from predicators.spot_utils.perception.perception_structs import RGBDImage, \ + RGBDImageWithContext, SegmentedBoundingBox +from predicators.spot_utils.perception.spot_cameras import capture_images, \ + capture_images_without_context from predicators.spot_utils.skills.spot_find_objects import \ init_search_for_objects from predicators.spot_utils.skills.spot_hand_move import \ diff --git a/predicators/ground_truth_models/spot_env/options.py b/predicators/ground_truth_models/spot_env/options.py index 4e9b7293bf..59749ab03f 100644 --- a/predicators/ground_truth_models/spot_env/options.py +++ b/predicators/ground_truth_models/spot_env/options.py @@ -1,22 +1,22 @@ """Ground-truth options for Spot environments.""" +import logging import time from typing import Callable, Dict, List, Optional, Sequence, Set, Tuple -import logging import numpy as np import pbrspot from bosdyn.client import math_helpers -from bosdyn.client.sdk import Robot from bosdyn.client.lease import LeaseClient +from bosdyn.client.sdk import Robot from gym.spaces import Box from predicators import utils from predicators.envs import get_or_create_env from predicators.envs.spot_env import HANDEMPTY_GRIPPER_THRESHOLD, \ SpotRearrangementEnv, _get_sweeping_surface_for_container, \ - get_detection_id_for_object, get_robot, get_robot_only, \ - get_robot_gripper_open_percentage, get_simulated_object, \ + get_detection_id_for_object, get_robot, \ + get_robot_gripper_open_percentage, get_robot_only, get_simulated_object, \ get_simulated_robot from predicators.ground_truth_models import GroundTruthOptionFactory from predicators.settings import CFG diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py index 6b834521c7..b9e56d7199 100644 --- a/predicators/perception/spot_perceiver.py +++ b/predicators/perception/spot_perceiver.py @@ -17,6 +17,10 @@ _PartialPerceptionState, _SpotObservation, in_general_view_classifier from predicators.perception.base_perceiver import BasePerceiver from predicators.settings import CFG +from predicators.spot_utils.perception.object_detection import \ + AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ + LanguageObjectDetectionID, ObjectDetectionID, _query_detic_sam2, \ + detect_objects, visualize_all_artifacts from predicators.spot_utils.utils import _container_type, \ _immovable_object_type, _movable_object_type, _robot_type, \ get_allowed_map_regions, load_spot_metadata, object_to_top_down_geom @@ -24,11 +28,6 @@ GoalDescription, GroundAtom, Object, Observation, Predicate, \ 
SpotActionExtraInfo, State, Task, Video -from predicators.spot_utils.perception.object_detection import \ - AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ - LanguageObjectDetectionID, ObjectDetectionID, detect_objects, \ - visualize_all_artifacts, _query_detic_sam2 - class SpotPerceiver(BasePerceiver): """A perceiver specific to spot envs.""" @@ -714,8 +713,6 @@ def step(self, observation: Observation) -> State: draw.text((x0 + 1, y0 - 1.5*text_height), text, fill='white', font=font) import pdb; pdb.set_trace() - - import PIL from PIL import ImageDraw annotated_pil_imgs = [] diff --git a/predicators/spot_utils/perception/object_detection.py b/predicators/spot_utils/perception/object_detection.py index 90365f2b02..6b627a90ef 100644 --- a/predicators/spot_utils/perception/object_detection.py +++ b/predicators/spot_utils/perception/object_detection.py @@ -41,7 +41,7 @@ from predicators.spot_utils.perception.perception_structs import \ AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ LanguageObjectDetectionID, ObjectDetectionID, PythonicObjectDetectionID, \ - RGBDImageWithContext, RGBDImage, SegmentedBoundingBox + RGBDImage, RGBDImageWithContext, SegmentedBoundingBox from predicators.spot_utils.utils import get_april_tag_transform, \ get_graph_nav_dir from predicators.utils import rotate_point_in_image diff --git a/predicators/spot_utils/perception/spot_cameras.py b/predicators/spot_utils/perception/spot_cameras.py index 15349535b8..036dc089ec 100644 --- a/predicators/spot_utils/perception/spot_cameras.py +++ b/predicators/spot_utils/perception/spot_cameras.py @@ -2,16 +2,16 @@ from typing import Collection, Dict, Optional, Type import cv2 -from scipy import ndimage import numpy as np from bosdyn.api import image_pb2 from bosdyn.client.frame_helpers import BODY_FRAME_NAME, get_a_tform_b from bosdyn.client.image import ImageClient, build_image_request from bosdyn.client.sdk import Robot from numpy.typing import NDArray +from scipy import ndimage -from predicators.spot_utils.perception.perception_structs import \ - RGBDImageWithContext, RGBDImage +from predicators.spot_utils.perception.perception_structs import RGBDImage, \ + RGBDImageWithContext from predicators.spot_utils.spot_localization import SpotLocalizer ROTATION_ANGLE = { diff --git a/predicators/spot_utils/spot_localization.py b/predicators/spot_utils/spot_localization.py index 27283b248e..145178cd7a 100644 --- a/predicators/spot_utils/spot_localization.py +++ b/predicators/spot_utils/spot_localization.py @@ -74,7 +74,6 @@ def __init__(self, robot: Robot, upload_path: Path, # raise LocalizationFailure(msg) # logging.warning("Localization failed once, retrying.") # time.sleep(LOCALIZATION_RETRY_WAIT_TIME) - # # Run localize once to start. # self.localize() From 5a461cc7fb24cfd9a09940453dc6e30969c04bbd Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Mon, 16 Sep 2024 14:30:31 -0400 Subject: [PATCH 5/5] Fix indent. 
--- predicators/envs/spot_env.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py index f7a6798a53..374c0114a6 100644 --- a/predicators/envs/spot_env.py +++ b/predicators/envs/spot_env.py @@ -190,15 +190,15 @@ def get_robot( @functools.lru_cache(maxsize=None) def get_robot_only() -> Tuple[Optional[Robot], Optional[LeaseClient]]: - hostname = CFG.spot_robot_ip - sdk = create_standard_sdk("PredicatorsClient-") - robot = sdk.create_robot(hostname) - robot.authenticate("user", "bbbdddaaaiii") - verify_estop(robot) - lease_client = robot.ensure_client(LeaseClient.default_service_name) - lease_client.take() - lease_keepalive = LeaseKeepAlive(lease_client, must_acquire=True, return_at_exit=True) - return robot, lease_client + hostname = CFG.spot_robot_ip + sdk = create_standard_sdk("PredicatorsClient-") + robot = sdk.create_robot(hostname) + robot.authenticate("user", "bbbdddaaaiii") + verify_estop(robot) + lease_client = robot.ensure_client(LeaseClient.default_service_name) + lease_client.take() + lease_keepalive = LeaseKeepAlive(lease_client, must_acquire=True, return_at_exit=True) + return robot, lease_client @functools.lru_cache(maxsize=None)
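Note (illustrative sketch, not part of the patches above): the detection flow added in this series inverts the DETIC-SAM result from object_id -> camera_name -> SegmentedBoundingBox into camera_name -> [(object_id, SegmentedBoundingBox)], then draws a labeled green box per detection on the rotated RGB image. A minimal standalone version of those two steps, using plain (x0, y0, x1, y1) tuples as stand-ins for SegmentedBoundingBox and a synthetic image instead of a Spot capture, could look roughly like this:

    from typing import Dict, List, Tuple

    import numpy as np
    from PIL import Image, ImageDraw, ImageFont

    Box = Tuple[float, float, float, float]


    def invert_detections(
        object_id_to_img_detections: Dict[str, Dict[str, Box]]
    ) -> Dict[str, List[Tuple[str, Box]]]:
        """Turn object_id -> camera_name -> box into camera_name -> [(object_id, box)]."""
        per_camera: Dict[str, List[Tuple[str, Box]]] = {}
        for object_id, camera_to_box in object_id_to_img_detections.items():
            for camera_name, box in camera_to_box.items():
                per_camera.setdefault(camera_name, []).append((object_id, box))
        return per_camera


    def annotate(img: np.ndarray, detections: List[Tuple[str, Box]]) -> np.ndarray:
        """Draw a labeled green rectangle per detection, mirroring the perceiver logic."""
        pil_img = Image.fromarray(img)
        draw = ImageDraw.Draw(pil_img)
        font = ImageFont.load_default()
        for name, (x0, y0, x1, y1) in detections:
            draw.rectangle([(x0, y0), (x1, y1)], outline="green", width=2)
            # Size the label background from the rendered text mask.
            text_width, text_height = font.getmask(name).size
            draw.rectangle([(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)],
                           fill="green")
            draw.text((x0 + 1, y0 - text_height - 1), name, fill="white", font=font)
        return np.array(pil_img)


    if __name__ == "__main__":
        # Toy example: one fake camera image with two detections.
        fake_img = np.zeros((480, 640, 3), dtype=np.uint8)
        dets = invert_detections({
            "cup": {"frontleft_fisheye_image": (100, 120, 180, 200)},
            "pan": {"frontleft_fisheye_image": (300, 220, 420, 320)},
        })
        out = annotate(fake_img, dets["frontleft_fisheye_image"])
        print(out.shape)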