From cf4033494dd7d3e883acd3f118971f8373af5bf1 Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Fri, 13 Sep 2024 16:17:53 -0400 Subject: [PATCH 1/5] Progress towards object detection result. --- predicators/envs/spot_env.py | 116 +++++++++++++++++- predicators/perception/spot_perceiver.py | 9 +- .../spot_utils/perception/object_detection.py | 109 +++++++++++++++- .../spot_utils/perception/spot_cameras.py | 12 +- 4 files changed, 230 insertions(+), 16 deletions(-) diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py index affadd59a9..d0b5f5f3b9 100644 --- a/predicators/envs/spot_env.py +++ b/predicators/envs/spot_env.py @@ -25,11 +25,11 @@ from predicators.spot_utils.perception.object_detection import \ AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ LanguageObjectDetectionID, ObjectDetectionID, detect_objects, \ - visualize_all_artifacts + visualize_all_artifacts, _query_detic_sam2 from predicators.spot_utils.perception.object_specific_grasp_selection import \ brush_prompt, bucket_prompt, football_prompt, train_toy_prompt from predicators.spot_utils.perception.perception_structs import \ - RGBDImageWithContext + RGBDImageWithContext, RGBDImage, SegmentedBoundingBox from predicators.spot_utils.perception.spot_cameras import capture_images, capture_images_without_context from predicators.spot_utils.skills.spot_find_objects import \ init_search_for_objects @@ -2468,8 +2468,16 @@ def _get_dry_task(self, train_or_test: str, @property def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]: - """Get an object from a perception detection ID.""" - raise NotImplementedError("No dry task for VLMTestEnv.") + + detection_id_to_obj: Dict[ObjectDetectionID, Object] = {} + objects = { + Object("pan", _movable_object_type), + Object("cup", _movable_object_type) + } + for o in objects: + detection_id = LanguageObjectDetectionID(o.name) + detection_id_to_obj[detection_id] = o + return detection_id_to_obj def _create_operators(self) -> Iterator[STRIPSOperator]: # Pick object @@ -2553,8 +2561,106 @@ def __init__(self, use_gui: bool = True) -> None: def _actively_construct_env_task(self) -> EnvironmentTask: assert self._robot is not None rgbd_images = capture_images_without_context(self._robot) + # import PIL + # imgs = [v.rgb for _, v in rgbd_images.items()] + # rot_imgs = [v.rotated_rgb for _, v in rgbd_images.items()] + # ex1 = PIL.Image.fromarray(imgs[0]) + # ex2 = PIL.Image.fromarray(rot_imgs[0]) + # import pdb; pdb.set_trace() gripper_open_percentage = get_robot_gripper_open_percentage(self._robot) objects_in_view = [] + + # Perform object detection. + object_ids = self._detection_id_to_obj.keys() + ret = _query_detic_sam2(object_ids, rgbd_images) + artifacts = {"language": {"rgbds": rgbd_images, "object_id_to_img_detections": ret}} + detections_outfile = Path(".") / "object_detection_artifacts.png" + no_detections_outfile = Path(".") / "no_detection_artifacts.png" + visualize_all_artifacts(artifacts, detections_outfile, no_detections_outfile) + + # Draw object bounding box on images. + rgbds = artifacts["language"]["rgbds"] + detections = artifacts["language"]["object_id_to_img_detections"] + flat_detections: List[Tuple[RGBDImage, + LanguageObjectDetectionID, + SegmentedBoundingBox]] = [] + for obj_id, img_detections in detections.items(): + for camera, seg_bb in img_detections.items(): + rgbd = rgbds[camera] + flat_detections.append((rgbd, obj_id, seg_bb)) + + # For now assume we only have 1 image, front-left. 
+ import pdb; pdb.set_trace() + import PIL + from PIL import ImageDraw, ImageFont + bb_pil_imgs = [] + img = list(rgbd_images.values())[0].rotated_rgb + pil_img = PIL.Image.fromarray(img) + draw = ImageDraw.Draw(pil_img) + for i, (rgbd, obj_id, seg_bb) in enumerate(flat_detections): + # img = rgbd.rotated_rgb + # pil_img = PIL.Image.fromarray(img) + x0, y0, x1, y1 = seg_bb.bounding_box + draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) + text = f"{obj_id.language_id}" + font = ImageFont.load_default() + # font = utils.get_scaled_default_font(draw, 4) + # text_width, text_height = draw.textsize(text, font) + # text_width = draw.textlength(text, font) + # text_height = font.getsize("hg")[1] + text_mask = font.getmask(text) + text_width, text_height = text_mask.size + text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + draw.rectangle(text_bbox, fill='green') + draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + + import pdb; pdb.set_trace() + + + + # box = seg_bb.bounding_box + # x0, y0 = box[0], box[1] + # w, h = box[2] - box[0], box[3] - box[1] + # ax_row[3].add_patch( + # plt.Rectangle((x0, y0), + # w, + # h, + # edgecolor='green', + # facecolor=(0, 0, 0, 0), + # lw=1)) + + # import PIL + # from PIL import ImageDraw + # annotated_pil_imgs = [] + # for img, img_name in zip(imgs, img_names): + # pil_img = PIL.Image.fromarray(img) + # draw = ImageDraw.Draw(pil_img) + # font = utils.get_scaled_default_font(draw, 4) + # annotated_pil_img = utils.add_text_to_draw_img(draw, (0, 0), self.camera_name_to_annotation[img_name], font) + # annotated_pil_imgs.append(pil_img) + # annotated_imgs = [np.array(img) for img in annotated_pil_imgs] + + # im = Image.open(image_path) + # draw = ImageDraw.Draw(im) + # font = ImageFont.load_default() # You can use a specific font if needed + + # for mask in masks: + # # Assuming you have a function to convert the mask to a PIL Image or polygon + # mask_image = convert_mask_to_pil(mask) + # im.paste(mask_image, (0, 0), mask_image) + + # for box, class_name, score in zip(input_boxes, classes, scores): + # x0, y0, x1, y1 = box + # draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) + # text = f"{class_name}: {score:.2f}" + # text_width, text_height = draw.textsize(text, font) + # text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + # draw.rectangle(text_bbox, fill='green') + # draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + + # im.show() # Or save it: im.save("output.jpg") + # import pdb; pdb.set_trace() + obs = _TruncatedSpotObservation( rgbd_images, set(objects_in_view), @@ -2621,8 +2727,8 @@ def step(self, action: Action) -> Observation: f"was encountered. 
Trying again.\n{e}") rgbd_images = capture_images_without_context(self._robot) gripper_open_percentage = get_robot_gripper_open_percentage(self._robot) - print(gripper_open_percentage) objects_in_view = [] + obs = _TruncatedSpotObservation( rgbd_images, set(objects_in_view), diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py index 14c4b5550c..eef7e53c29 100644 --- a/predicators/perception/spot_perceiver.py +++ b/predicators/perception/spot_perceiver.py @@ -24,6 +24,11 @@ GoalDescription, GroundAtom, Object, Observation, Predicate, \ SpotActionExtraInfo, State, Task, Video +from predicators.spot_utils.perception.object_detection import \ + AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ + LanguageObjectDetectionID, ObjectDetectionID, detect_objects, \ + visualize_all_artifacts, _query_detic_sam2 + class SpotPerceiver(BasePerceiver): """A perceiver specific to spot envs.""" @@ -683,7 +688,7 @@ def step(self, observation: Observation) -> State: imgs = observation.rgbd_images img_names = [v.camera_name for _, v in imgs.items()] imgs = [v.rgb for _, v in imgs.items()] - import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() import PIL from PIL import ImageDraw annotated_pil_imgs = [] @@ -694,7 +699,7 @@ def step(self, observation: Observation) -> State: annotated_pil_img = utils.add_text_to_draw_img(draw, (0, 0), self.camera_name_to_annotation[img_name], font) annotated_pil_imgs.append(pil_img) annotated_imgs = [np.array(img) for img in annotated_pil_imgs] - import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() self._gripper_open_percentage = observation.gripper_open_percentage self._curr_state = self._create_state() self._curr_state.simulator_state["images"] = annotated_imgs diff --git a/predicators/spot_utils/perception/object_detection.py b/predicators/spot_utils/perception/object_detection.py index 6757e8f324..94b7581d66 100644 --- a/predicators/spot_utils/perception/object_detection.py +++ b/predicators/spot_utils/perception/object_detection.py @@ -41,7 +41,7 @@ from predicators.spot_utils.perception.perception_structs import \ AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ LanguageObjectDetectionID, ObjectDetectionID, PythonicObjectDetectionID, \ - RGBDImageWithContext, SegmentedBoundingBox + RGBDImageWithContext, RGBDImage, SegmentedBoundingBox from predicators.spot_utils.utils import get_april_tag_transform, \ get_graph_nav_dir from predicators.utils import rotate_point_in_image @@ -351,6 +351,108 @@ def _query_detic_sam( return object_id_to_img_detections +def _query_detic_sam2( + object_ids: Collection[LanguageObjectDetectionID], + rgbds: Dict[str, RGBDImage], + max_server_retries: int = 5, + detection_threshold: float = CFG.spot_vision_detection_threshold +) -> Dict[ObjectDetectionID, Dict[str, SegmentedBoundingBox]]: + """Returns object ID to image ID (camera) to segmented bounding box.""" + + object_id_to_img_detections: Dict[ObjectDetectionID, + Dict[str, SegmentedBoundingBox]] = { + obj_id: {} + for obj_id in object_ids + } + + # Create buffer dictionary to send to server. + buf_dict = {} + for camera_name, rgbd in rgbds.items(): + pil_rotated_img = PIL.Image.fromarray(rgbd.rotated_rgb) # type: ignore + buf_dict[camera_name] = _image_to_bytes(pil_rotated_img) + + # Extract all the classes that we want to detect. + classes = sorted(o.language_id for o in object_ids) + + # Query server, retrying to handle possible wifi issues. 
+ # import pdb; pdb.set_trace() + # imgs = [v.rotated_rgb for _, v in rgbds.items()] + # pil_img = PIL.Image.fromarray(imgs[0]) + # import pdb; pdb.set_trace() + + for _ in range(max_server_retries): + try: + r = requests.post("http://localhost:5550/batch_predict", + files=buf_dict, + data={"classes": ",".join(classes)}) + break + except requests.exceptions.ConnectionError: + continue + else: + logging.warning("DETIC-SAM FAILED, POSSIBLE SERVER/WIFI ISSUE") + return object_id_to_img_detections + + # If the status code is not 200, then fail. + if r.status_code != 200: + logging.warning(f"DETIC-SAM FAILED! STATUS CODE: {r.status_code}") + return object_id_to_img_detections + + # Querying the server succeeded; unpack the contents. + with io.BytesIO(r.content) as f: + try: + server_results = np.load(f, allow_pickle=True) + # Corrupted results. + except pkl.UnpicklingError: + logging.warning("DETIC-SAM FAILED DURING UNPICKLING!") + return object_id_to_img_detections + + # Process the results and save all detections per object ID. + for camera_name, rgbd in rgbds.items(): + rot_boxes = server_results[f"{camera_name}_boxes"] + ret_classes = server_results[f"{camera_name}_classes"] + rot_masks = server_results[f"{camera_name}_masks"] + scores = server_results[f"{camera_name}_scores"] + + # Invert the rotation immediately so we don't need to worry about + # them henceforth. + # h, w = rgbd.rgb.shape[:2] + # image_rot = rgbd.image_rot + # boxes = [ + # _rotate_bounding_box(bb, -image_rot, h, w) for bb in rot_boxes + # ] + # masks = [ + # ndimage.rotate(m.squeeze(), -image_rot, reshape=False) + # for m in rot_masks + # ] + boxes = rot_boxes + masks = rot_masks + + # Filter out detections by confidence. We threshold detections + # at a set confidence level minimum, and if there are multiple, + # we only select the most confident one. This structure makes + # it easy for us to select multiple detections if that's ever + # necessary in the future. + for obj_id in object_ids: + # If there were no detections (which means all the + # returned values will be numpy arrays of shape (0, 0)) + # then just skip this source. + if ret_classes.size == 0: + continue + obj_id_mask = (ret_classes == obj_id.language_id) + if not np.any(obj_id_mask): + continue + max_score = np.max(scores[obj_id_mask]) + best_idx = np.where(scores == max_score)[0].item() + if scores[best_idx] < detection_threshold: + continue + # Save the detection. + seg_bb = SegmentedBoundingBox(boxes[best_idx], masks[best_idx], + scores[best_idx]) + object_id_to_img_detections[obj_id][rgbd.camera_name] = seg_bb + + import pdb; pdb.set_trace() + return object_id_to_img_detections + def _image_to_bytes(img: PIL.Image.Image) -> io.BytesIO: """Helper function to convert from a PIL image into a bytes object.""" @@ -522,7 +624,8 @@ def visualize_all_artifacts(artifacts: Dict[str, ax_row[2].imshow(rgbd.depth, cmap='Greys_r', vmin=0, vmax=10000) # Bounding box. - ax_row[3].imshow(rgbd.rgb) + # ax_row[3].imshow(rgbd.rgb) + ax_row[3].imshow(rgbd.rotated_rgb) box = seg_bb.bounding_box x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] @@ -534,7 +637,7 @@ def visualize_all_artifacts(artifacts: Dict[str, facecolor=(0, 0, 0, 0), lw=1)) - ax_row[4].imshow(seg_bb.mask, cmap="binary_r", vmin=0, vmax=1) + # ax_row[4].imshow(seg_bb.mask, cmap="binary_r", vmin=0, vmax=1) # Labels. 
abbreviated_name = obj_id.language_id diff --git a/predicators/spot_utils/perception/spot_cameras.py b/predicators/spot_utils/perception/spot_cameras.py index 0c249a89f4..299fd8db0f 100644 --- a/predicators/spot_utils/perception/spot_cameras.py +++ b/predicators/spot_utils/perception/spot_cameras.py @@ -23,12 +23,12 @@ 'right_fisheye_image': 180 } RGB_TO_DEPTH_CAMERAS = { - "hand_color_image": "hand_depth_in_hand_color_frame", - "left_fisheye_image": "left_depth_in_visual_frame", - "right_fisheye_image": "right_depth_in_visual_frame", + # "hand_color_image": "hand_depth_in_hand_color_frame", + # "left_fisheye_image": "left_depth_in_visual_frame", + # "right_fisheye_image": "right_depth_in_visual_frame", "frontleft_fisheye_image": "frontleft_depth_in_visual_frame", - "frontright_fisheye_image": "frontright_depth_in_visual_frame", - "back_fisheye_image": "back_depth_in_visual_frame" + # "frontright_fisheye_image": "frontright_depth_in_visual_frame", + # "back_fisheye_image": "back_depth_in_visual_frame" } # Hack to avoid double image capturing when we want to (1) get object states @@ -174,7 +174,7 @@ def capture_images_without_context( rgb_img_resp = name_to_response[camera_name] depth_img_resp = name_to_response[RGB_TO_DEPTH_CAMERAS[camera_name]] rgb_img = _image_response_to_image(rgb_img_resp) - rgb_img = ndimage.rotate(rgb_img, ROTATION_ANGLE[camera_name]) + # rgb_img = ndimage.rotate(rgb_img, ROTATION_ANGLE[camera_name], reshape=False) depth_img = _image_response_to_image(depth_img_resp) # # Create transform. # camera_tform_body = get_a_tform_b( From 94dff09968ad3fc417ea0ae6ef555887d22b59d7 Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Sat, 14 Sep 2024 18:16:16 -0400 Subject: [PATCH 2/5] Get camera annotation + object detection annotation to work on all images. --- predicators/envs/spot_env.py | 158 ++++++++---------- predicators/perception/spot_perceiver.py | 36 +++- .../spot_utils/perception/object_detection.py | 2 +- .../spot_utils/perception/spot_cameras.py | 12 +- 4 files changed, 106 insertions(+), 102 deletions(-) diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py index d0b5f5f3b9..3700100e0c 100644 --- a/predicators/envs/spot_env.py +++ b/predicators/envs/spot_env.py @@ -105,6 +105,8 @@ class _TruncatedSpotObservation: # # A placeholder until all predicates have classifiers # nonpercept_atoms: Set[GroundAtom] # nonpercept_predicates: Set[Predicate] + # Object detections per camera in self.rgbd_images. + object_detections_per_camera: Dict[str, List[Tuple[ObjectDetectionID, SegmentedBoundingBox]]] class _PartialPerceptionState(State): @@ -2472,7 +2474,9 @@ def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]: detection_id_to_obj: Dict[ObjectDetectionID, Object] = {} objects = { Object("pan", _movable_object_type), - Object("cup", _movable_object_type) + Object("cup", _movable_object_type), + Object("chair", _movable_object_type), + Object("bowl", _movable_object_type), } for o in objects: detection_id = LanguageObjectDetectionID(o.name) @@ -2558,6 +2562,18 @@ def __init__(self, use_gui: bool = True) -> None: self._train_tasks = [] self._test_tasks = [] + def detect_objects(self, rgbd_images: Dict[str, RGBDImage]) -> Dict[str, List[Tuple[ObjectDetectionID, SegmentedBoundingBox]]]: + object_ids = self._detection_id_to_obj.keys() + object_id_to_img_detections = _query_detic_sam2(object_ids, rgbd_images) + # This ^ is currently a mapping of object_id -> camera_name -> SegmentedBoundingBox. 
+ # We want to do our annotations by camera image, so let's turn this into a + # mapping of camera_name -> object_id -> SegmentedBoundingBox. + detections = {k: [] for k in rgbd_images.keys()} + for object_id, d in object_id_to_img_detections.items(): + for camera_name, seg_bb in d.items(): + detections[camera_name].append((object_id, seg_bb)) + return detections + def _actively_construct_env_task(self) -> EnvironmentTask: assert self._robot is not None rgbd_images = capture_images_without_context(self._robot) @@ -2571,103 +2587,60 @@ def _actively_construct_env_task(self) -> EnvironmentTask: objects_in_view = [] # Perform object detection. - object_ids = self._detection_id_to_obj.keys() - ret = _query_detic_sam2(object_ids, rgbd_images) - artifacts = {"language": {"rgbds": rgbd_images, "object_id_to_img_detections": ret}} - detections_outfile = Path(".") / "object_detection_artifacts.png" - no_detections_outfile = Path(".") / "no_detection_artifacts.png" - visualize_all_artifacts(artifacts, detections_outfile, no_detections_outfile) - - # Draw object bounding box on images. - rgbds = artifacts["language"]["rgbds"] - detections = artifacts["language"]["object_id_to_img_detections"] - flat_detections: List[Tuple[RGBDImage, - LanguageObjectDetectionID, - SegmentedBoundingBox]] = [] - for obj_id, img_detections in detections.items(): - for camera, seg_bb in img_detections.items(): - rgbd = rgbds[camera] - flat_detections.append((rgbd, obj_id, seg_bb)) + object_detections_per_camera = self.detect_objects(rgbd_images) + + + # artifacts = {"language": {"rgbds": rgbd_images, "object_id_to_img_detections": ret}} + # detections_outfile = Path(".") / "object_detection_artifacts.png" + # no_detections_outfile = Path(".") / "no_detection_artifacts.png" + # visualize_all_artifacts(artifacts, detections_outfile, no_detections_outfile) + + # # Draw object bounding box on images. + # rgbds = artifacts["language"]["rgbds"] + # detections = artifacts["language"]["object_id_to_img_detections"] + # flat_detections: List[Tuple[RGBDImage, + # LanguageObjectDetectionID, + # SegmentedBoundingBox]] = [] + # for obj_id, img_detections in detections.items(): + # for camera, seg_bb in img_detections.items(): + # rgbd = rgbds[camera] + # flat_detections.append((rgbd, obj_id, seg_bb)) - # For now assume we only have 1 image, front-left. 
- import pdb; pdb.set_trace() - import PIL - from PIL import ImageDraw, ImageFont - bb_pil_imgs = [] - img = list(rgbd_images.values())[0].rotated_rgb - pil_img = PIL.Image.fromarray(img) - draw = ImageDraw.Draw(pil_img) - for i, (rgbd, obj_id, seg_bb) in enumerate(flat_detections): - # img = rgbd.rotated_rgb - # pil_img = PIL.Image.fromarray(img) - x0, y0, x1, y1 = seg_bb.bounding_box - draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) - text = f"{obj_id.language_id}" - font = ImageFont.load_default() - # font = utils.get_scaled_default_font(draw, 4) - # text_width, text_height = draw.textsize(text, font) - # text_width = draw.textlength(text, font) - # text_height = font.getsize("hg")[1] - text_mask = font.getmask(text) - text_width, text_height = text_mask.size - text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] - draw.rectangle(text_bbox, fill='green') - draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) - - import pdb; pdb.set_trace() - - - - # box = seg_bb.bounding_box - # x0, y0 = box[0], box[1] - # w, h = box[2] - box[0], box[3] - box[1] - # ax_row[3].add_patch( - # plt.Rectangle((x0, y0), - # w, - # h, - # edgecolor='green', - # facecolor=(0, 0, 0, 0), - # lw=1)) - - # import PIL - # from PIL import ImageDraw - # annotated_pil_imgs = [] - # for img, img_name in zip(imgs, img_names): - # pil_img = PIL.Image.fromarray(img) - # draw = ImageDraw.Draw(pil_img) - # font = utils.get_scaled_default_font(draw, 4) - # annotated_pil_img = utils.add_text_to_draw_img(draw, (0, 0), self.camera_name_to_annotation[img_name], font) - # annotated_pil_imgs.append(pil_img) - # annotated_imgs = [np.array(img) for img in annotated_pil_imgs] - - # im = Image.open(image_path) - # draw = ImageDraw.Draw(im) - # font = ImageFont.load_default() # You can use a specific font if needed - - # for mask in masks: - # # Assuming you have a function to convert the mask to a PIL Image or polygon - # mask_image = convert_mask_to_pil(mask) - # im.paste(mask_image, (0, 0), mask_image) - - # for box, class_name, score in zip(input_boxes, classes, scores): - # x0, y0, x1, y1 = box - # draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) - # text = f"{class_name}: {score:.2f}" - # text_width, text_height = draw.textsize(text, font) - # text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] - # draw.rectangle(text_bbox, fill='green') - # draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) - - # im.show() # Or save it: im.save("output.jpg") - # import pdb; pdb.set_trace() + # # For now assume we only have 1 image, front-left. 
+ # import pdb; pdb.set_trace() + # import PIL + # from PIL import ImageDraw, ImageFont + # bb_pil_imgs = [] + # img = list(rgbd_images.values())[0].rotated_rgb + # pil_img = PIL.Image.fromarray(img) + # draw = ImageDraw.Draw(pil_img) + # for i, (rgbd, obj_id, seg_bb) in enumerate(flat_detections): + # # img = rgbd.rotated_rgb + # # pil_img = PIL.Image.fromarray(img) + # x0, y0, x1, y1 = seg_bb.bounding_box + # draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) + # text = f"{obj_id.language_id}" + # font = ImageFont.load_default() + # # font = utils.get_scaled_default_font(draw, 4) + # # text_width, text_height = draw.textsize(text, font) + # # text_width = draw.textlength(text, font) + # # text_height = font.getsize("hg")[1] + # text_mask = font.getmask(text) + # text_width, text_height = text_mask.size + # text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + # draw.rectangle(text_bbox, fill='green') + # draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + # import pdb; pdb.set_trace() + obs = _TruncatedSpotObservation( rgbd_images, set(objects_in_view), set(), set(), self._spot_object, - gripper_open_percentage + gripper_open_percentage, + object_detections_per_camera ) goal_description = self._generate_goal_description() task = EnvironmentTask(obs, goal_description) @@ -2728,6 +2701,8 @@ def step(self, action: Action) -> Observation: rgbd_images = capture_images_without_context(self._robot) gripper_open_percentage = get_robot_gripper_open_percentage(self._robot) objects_in_view = [] + # Perform object detection. + object_detections_per_camera = self.detect_objects(rgbd_images) obs = _TruncatedSpotObservation( rgbd_images, @@ -2735,7 +2710,8 @@ def step(self, action: Action) -> Observation: set(), set(), self._spot_object, - gripper_open_percentage + gripper_open_percentage, + object_detections_per_camera ) return obs diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py index eef7e53c29..1f86189b71 100644 --- a/predicators/perception/spot_perceiver.py +++ b/predicators/perception/spot_perceiver.py @@ -683,12 +683,40 @@ def reset(self, env_task: EnvironmentTask) -> Task: return Task(state, goal) def step(self, observation: Observation) -> State: + import pdb; pdb.set_trace() self._waiting_for_observation = False self._robot = observation.robot - imgs = observation.rgbd_images - img_names = [v.camera_name for _, v in imgs.items()] - imgs = [v.rgb for _, v in imgs.items()] - # import pdb; pdb.set_trace() + img_objects = observation.rgbd_images # RGBDImage objects + img_names = [v.camera_name for _, v in img_objects.items()] + imgs = [v.rotated_rgb for _, v in img_objects.items()] + import PIL + from PIL import ImageDraw, ImageFont + pil_imgs = [PIL.Image.fromarray(img) for img in imgs] + # Annotate images with detected objects (names + bounding box) + # and camera name. + object_detections_per_camera = observation.object_detections_per_camera + imgs_with_objects_annotated = [] # These are PIL images. + for i, camera_name in enumerate(img_names): + draw = ImageDraw.Draw(pil_imgs[i]) + # Annotate with camera name. + font = utils.get_scaled_default_font(draw, 4) + _ = utils.add_text_to_draw_img(draw, (0, 0), self.camera_name_to_annotation[camera_name], font) + # Annotate with object detections. 
+ detections = object_detections_per_camera[camera_name] + for obj_id, seg_bb in detections: + x0, y0, x1, y1 = seg_bb.bounding_box + draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) + text = f"{obj_id.language_id}" + font = ImageFont.load_default() + text_mask = font.getmask(text) + text_width, text_height = text_mask.size + text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + draw.rectangle(text_bbox, fill='green') + draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + + import pdb; pdb.set_trace() + + import PIL from PIL import ImageDraw annotated_pil_imgs = [] diff --git a/predicators/spot_utils/perception/object_detection.py b/predicators/spot_utils/perception/object_detection.py index 94b7581d66..90365f2b02 100644 --- a/predicators/spot_utils/perception/object_detection.py +++ b/predicators/spot_utils/perception/object_detection.py @@ -450,7 +450,7 @@ def _query_detic_sam2( scores[best_idx]) object_id_to_img_detections[obj_id][rgbd.camera_name] = seg_bb - import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() return object_id_to_img_detections diff --git a/predicators/spot_utils/perception/spot_cameras.py b/predicators/spot_utils/perception/spot_cameras.py index 299fd8db0f..be5131690a 100644 --- a/predicators/spot_utils/perception/spot_cameras.py +++ b/predicators/spot_utils/perception/spot_cameras.py @@ -23,12 +23,12 @@ 'right_fisheye_image': 180 } RGB_TO_DEPTH_CAMERAS = { - # "hand_color_image": "hand_depth_in_hand_color_frame", - # "left_fisheye_image": "left_depth_in_visual_frame", - # "right_fisheye_image": "right_depth_in_visual_frame", + "hand_color_image": "hand_depth_in_hand_color_frame", + "left_fisheye_image": "left_depth_in_visual_frame", + "right_fisheye_image": "right_depth_in_visual_frame", "frontleft_fisheye_image": "frontleft_depth_in_visual_frame", - # "frontright_fisheye_image": "frontright_depth_in_visual_frame", - # "back_fisheye_image": "back_depth_in_visual_frame" + "frontright_fisheye_image": "frontright_depth_in_visual_frame", + "back_fisheye_image": "back_depth_in_visual_frame" } # Hack to avoid double image capturing when we want to (1) get object states @@ -125,7 +125,7 @@ def capture_images_without_context( robot: Robot, camera_names: Optional[Collection[str]] = None, quality_percent: int = 100, -) -> Dict[str, RGBDImageWithContext]: +) -> Dict[str, RGBDImage]: """Build an image request and get the responses. If no camera names are provided, all RGB cameras are used. From aec3f863604361410d0342a934a6e215b7b6b1d7 Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Sat, 14 Sep 2024 21:00:21 -0400 Subject: [PATCH 3/5] Update annotation text font and background. --- predicators/perception/spot_perceiver.py | 7 +++---- predicators/spot_utils/perception/spot_cameras.py | 10 +++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py index 1f86189b71..6b834521c7 100644 --- a/predicators/perception/spot_perceiver.py +++ b/predicators/perception/spot_perceiver.py @@ -695,7 +695,6 @@ def step(self, observation: Observation) -> State: # Annotate images with detected objects (names + bounding box) # and camera name. object_detections_per_camera = observation.object_detections_per_camera - imgs_with_objects_annotated = [] # These are PIL images. for i, camera_name in enumerate(img_names): draw = ImageDraw.Draw(pil_imgs[i]) # Annotate with camera name. 
@@ -707,12 +706,12 @@ def step(self, observation: Observation) -> State: x0, y0, x1, y1 = seg_bb.bounding_box draw.rectangle([(x0, y0), (x1, y1)], outline='green', width=2) text = f"{obj_id.language_id}" - font = ImageFont.load_default() + font = utils.get_scaled_default_font(draw, 3) text_mask = font.getmask(text) text_width, text_height = text_mask.size - text_bbox = [(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)] + text_bbox = [(x0, y0 - 1.5*text_height), (x0 + text_width + 1, y0)] draw.rectangle(text_bbox, fill='green') - draw.text((x0 + 1, y0 - text_height - 1), text, fill='white', font=font) + draw.text((x0 + 1, y0 - 1.5*text_height), text, fill='white', font=font) import pdb; pdb.set_trace() diff --git a/predicators/spot_utils/perception/spot_cameras.py b/predicators/spot_utils/perception/spot_cameras.py index be5131690a..15349535b8 100644 --- a/predicators/spot_utils/perception/spot_cameras.py +++ b/predicators/spot_utils/perception/spot_cameras.py @@ -23,12 +23,12 @@ 'right_fisheye_image': 180 } RGB_TO_DEPTH_CAMERAS = { - "hand_color_image": "hand_depth_in_hand_color_frame", - "left_fisheye_image": "left_depth_in_visual_frame", - "right_fisheye_image": "right_depth_in_visual_frame", + # "hand_color_image": "hand_depth_in_hand_color_frame", + # "left_fisheye_image": "left_depth_in_visual_frame", + # "right_fisheye_image": "right_depth_in_visual_frame", "frontleft_fisheye_image": "frontleft_depth_in_visual_frame", - "frontright_fisheye_image": "frontright_depth_in_visual_frame", - "back_fisheye_image": "back_depth_in_visual_frame" + # "frontright_fisheye_image": "frontright_depth_in_visual_frame", + # "back_fisheye_image": "back_depth_in_visual_frame" } # Hack to avoid double image capturing when we want to (1) get object states From ac9fc2f69b6ff01832e6c202feebfd359903f211 Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Sat, 14 Sep 2024 21:07:05 -0400 Subject: [PATCH 4/5] Run autoformatter. --- predicators/approaches/bilevel_planning_approach.py | 1 - predicators/envs/spot_env.py | 11 ++++++----- predicators/ground_truth_models/spot_env/options.py | 8 ++++---- predicators/perception/spot_perceiver.py | 11 ++++------- predicators/spot_utils/perception/object_detection.py | 2 +- predicators/spot_utils/perception/spot_cameras.py | 6 +++--- predicators/spot_utils/spot_localization.py | 1 - 7 files changed, 18 insertions(+), 22 deletions(-) diff --git a/predicators/approaches/bilevel_planning_approach.py b/predicators/approaches/bilevel_planning_approach.py index 3c02e7dbc5..dbf9eb8a33 100644 --- a/predicators/approaches/bilevel_planning_approach.py +++ b/predicators/approaches/bilevel_planning_approach.py @@ -61,7 +61,6 @@ def _solve(self, task: Task, timeout: int) -> Callable[[State], Action]: utils.abstract(task.init, preds, self._vlm) import pdb; pdb.set_trace() # utils.abstract(task.init, preds, self._vlm) - # Run task planning only and then greedily sample and execute in the # policy. 
if self._plan_without_sim: diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py index 3700100e0c..f7a6798a53 100644 --- a/predicators/envs/spot_env.py +++ b/predicators/envs/spot_env.py @@ -24,13 +24,14 @@ from predicators.settings import CFG from predicators.spot_utils.perception.object_detection import \ AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ - LanguageObjectDetectionID, ObjectDetectionID, detect_objects, \ - visualize_all_artifacts, _query_detic_sam2 + LanguageObjectDetectionID, ObjectDetectionID, _query_detic_sam2, \ + detect_objects, visualize_all_artifacts from predicators.spot_utils.perception.object_specific_grasp_selection import \ brush_prompt, bucket_prompt, football_prompt, train_toy_prompt -from predicators.spot_utils.perception.perception_structs import \ - RGBDImageWithContext, RGBDImage, SegmentedBoundingBox -from predicators.spot_utils.perception.spot_cameras import capture_images, capture_images_without_context +from predicators.spot_utils.perception.perception_structs import RGBDImage, \ + RGBDImageWithContext, SegmentedBoundingBox +from predicators.spot_utils.perception.spot_cameras import capture_images, \ + capture_images_without_context from predicators.spot_utils.skills.spot_find_objects import \ init_search_for_objects from predicators.spot_utils.skills.spot_hand_move import \ diff --git a/predicators/ground_truth_models/spot_env/options.py b/predicators/ground_truth_models/spot_env/options.py index 4e9b7293bf..59749ab03f 100644 --- a/predicators/ground_truth_models/spot_env/options.py +++ b/predicators/ground_truth_models/spot_env/options.py @@ -1,22 +1,22 @@ """Ground-truth options for Spot environments.""" +import logging import time from typing import Callable, Dict, List, Optional, Sequence, Set, Tuple -import logging import numpy as np import pbrspot from bosdyn.client import math_helpers -from bosdyn.client.sdk import Robot from bosdyn.client.lease import LeaseClient +from bosdyn.client.sdk import Robot from gym.spaces import Box from predicators import utils from predicators.envs import get_or_create_env from predicators.envs.spot_env import HANDEMPTY_GRIPPER_THRESHOLD, \ SpotRearrangementEnv, _get_sweeping_surface_for_container, \ - get_detection_id_for_object, get_robot, get_robot_only, \ - get_robot_gripper_open_percentage, get_simulated_object, \ + get_detection_id_for_object, get_robot, \ + get_robot_gripper_open_percentage, get_robot_only, get_simulated_object, \ get_simulated_robot from predicators.ground_truth_models import GroundTruthOptionFactory from predicators.settings import CFG diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py index 6b834521c7..b9e56d7199 100644 --- a/predicators/perception/spot_perceiver.py +++ b/predicators/perception/spot_perceiver.py @@ -17,6 +17,10 @@ _PartialPerceptionState, _SpotObservation, in_general_view_classifier from predicators.perception.base_perceiver import BasePerceiver from predicators.settings import CFG +from predicators.spot_utils.perception.object_detection import \ + AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ + LanguageObjectDetectionID, ObjectDetectionID, _query_detic_sam2, \ + detect_objects, visualize_all_artifacts from predicators.spot_utils.utils import _container_type, \ _immovable_object_type, _movable_object_type, _robot_type, \ get_allowed_map_regions, load_spot_metadata, object_to_top_down_geom @@ -24,11 +28,6 @@ GoalDescription, GroundAtom, Object, Observation, Predicate, \ 
SpotActionExtraInfo, State, Task, Video -from predicators.spot_utils.perception.object_detection import \ - AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ - LanguageObjectDetectionID, ObjectDetectionID, detect_objects, \ - visualize_all_artifacts, _query_detic_sam2 - class SpotPerceiver(BasePerceiver): """A perceiver specific to spot envs.""" @@ -714,8 +713,6 @@ def step(self, observation: Observation) -> State: draw.text((x0 + 1, y0 - 1.5*text_height), text, fill='white', font=font) import pdb; pdb.set_trace() - - import PIL from PIL import ImageDraw annotated_pil_imgs = [] diff --git a/predicators/spot_utils/perception/object_detection.py b/predicators/spot_utils/perception/object_detection.py index 90365f2b02..6b627a90ef 100644 --- a/predicators/spot_utils/perception/object_detection.py +++ b/predicators/spot_utils/perception/object_detection.py @@ -41,7 +41,7 @@ from predicators.spot_utils.perception.perception_structs import \ AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \ LanguageObjectDetectionID, ObjectDetectionID, PythonicObjectDetectionID, \ - RGBDImageWithContext, RGBDImage, SegmentedBoundingBox + RGBDImage, RGBDImageWithContext, SegmentedBoundingBox from predicators.spot_utils.utils import get_april_tag_transform, \ get_graph_nav_dir from predicators.utils import rotate_point_in_image diff --git a/predicators/spot_utils/perception/spot_cameras.py b/predicators/spot_utils/perception/spot_cameras.py index 15349535b8..036dc089ec 100644 --- a/predicators/spot_utils/perception/spot_cameras.py +++ b/predicators/spot_utils/perception/spot_cameras.py @@ -2,16 +2,16 @@ from typing import Collection, Dict, Optional, Type import cv2 -from scipy import ndimage import numpy as np from bosdyn.api import image_pb2 from bosdyn.client.frame_helpers import BODY_FRAME_NAME, get_a_tform_b from bosdyn.client.image import ImageClient, build_image_request from bosdyn.client.sdk import Robot from numpy.typing import NDArray +from scipy import ndimage -from predicators.spot_utils.perception.perception_structs import \ - RGBDImageWithContext, RGBDImage +from predicators.spot_utils.perception.perception_structs import RGBDImage, \ + RGBDImageWithContext from predicators.spot_utils.spot_localization import SpotLocalizer ROTATION_ANGLE = { diff --git a/predicators/spot_utils/spot_localization.py b/predicators/spot_utils/spot_localization.py index 27283b248e..145178cd7a 100644 --- a/predicators/spot_utils/spot_localization.py +++ b/predicators/spot_utils/spot_localization.py @@ -74,7 +74,6 @@ def __init__(self, robot: Robot, upload_path: Path, # raise LocalizationFailure(msg) # logging.warning("Localization failed once, retrying.") # time.sleep(LOCALIZATION_RETRY_WAIT_TIME) - # # Run localize once to start. # self.localize() From 5a461cc7fb24cfd9a09940453dc6e30969c04bbd Mon Sep 17 00:00:00 2001 From: Ashay Athalye Date: Mon, 16 Sep 2024 14:30:31 -0400 Subject: [PATCH 5/5] Fix indent. 
--- predicators/envs/spot_env.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py index f7a6798a53..374c0114a6 100644 --- a/predicators/envs/spot_env.py +++ b/predicators/envs/spot_env.py @@ -190,15 +190,15 @@ def get_robot( @functools.lru_cache(maxsize=None) def get_robot_only() -> Tuple[Optional[Robot], Optional[LeaseClient]]: - hostname = CFG.spot_robot_ip - sdk = create_standard_sdk("PredicatorsClient-") - robot = sdk.create_robot(hostname) - robot.authenticate("user", "bbbdddaaaiii") - verify_estop(robot) - lease_client = robot.ensure_client(LeaseClient.default_service_name) - lease_client.take() - lease_keepalive = LeaseKeepAlive(lease_client, must_acquire=True, return_at_exit=True) - return robot, lease_client + hostname = CFG.spot_robot_ip + sdk = create_standard_sdk("PredicatorsClient-") + robot = sdk.create_robot(hostname) + robot.authenticate("user", "bbbdddaaaiii") + verify_estop(robot) + lease_client = robot.ensure_client(LeaseClient.default_service_name) + lease_client.take() + lease_keepalive = LeaseKeepAlive(lease_client, must_acquire=True, return_at_exit=True) + return robot, lease_client @functools.lru_cache(maxsize=None)
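Note (illustrative sketch, not part of the patches above): the detection flow added in this series inverts the DETIC-SAM result from object_id -> camera_name -> SegmentedBoundingBox into camera_name -> [(object_id, SegmentedBoundingBox)], then draws a labeled green box per detection on the rotated RGB image. A minimal standalone version of those two steps, using plain (x0, y0, x1, y1) tuples as stand-ins for SegmentedBoundingBox and a synthetic image instead of a Spot capture, could look roughly like this:

    from typing import Dict, List, Tuple

    import numpy as np
    from PIL import Image, ImageDraw, ImageFont

    Box = Tuple[float, float, float, float]


    def invert_detections(
        object_id_to_img_detections: Dict[str, Dict[str, Box]]
    ) -> Dict[str, List[Tuple[str, Box]]]:
        """Turn object_id -> camera_name -> box into camera_name -> [(object_id, box)]."""
        per_camera: Dict[str, List[Tuple[str, Box]]] = {}
        for object_id, camera_to_box in object_id_to_img_detections.items():
            for camera_name, box in camera_to_box.items():
                per_camera.setdefault(camera_name, []).append((object_id, box))
        return per_camera


    def annotate(img: np.ndarray, detections: List[Tuple[str, Box]]) -> np.ndarray:
        """Draw a labeled green rectangle per detection, mirroring the perceiver logic."""
        pil_img = Image.fromarray(img)
        draw = ImageDraw.Draw(pil_img)
        font = ImageFont.load_default()
        for name, (x0, y0, x1, y1) in detections:
            draw.rectangle([(x0, y0), (x1, y1)], outline="green", width=2)
            # Size the label background from the rendered text mask.
            text_width, text_height = font.getmask(name).size
            draw.rectangle([(x0, y0 - text_height - 2), (x0 + text_width + 2, y0)],
                           fill="green")
            draw.text((x0 + 1, y0 - text_height - 1), name, fill="white", font=font)
        return np.array(pil_img)


    if __name__ == "__main__":
        # Toy example: one fake camera image with two detections.
        fake_img = np.zeros((480, 640, 3), dtype=np.uint8)
        dets = invert_detections({
            "cup": {"frontleft_fisheye_image": (100, 120, 180, 200)},
            "pan": {"frontleft_fisheye_image": (300, 220, 420, 320)},
        })
        out = annotate(fake_img, dets["frontleft_fisheye_image"])
        print(out.shape)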