Skip to content

Commit

Permalink
Get sweeping demo mostly working.
Browse files Browse the repository at this point in the history
  • Loading branch information
ashay-bdai committed Sep 19, 2024
1 parent ba8d070 commit 29242a8
Show file tree
Hide file tree
Showing 8 changed files with 120 additions and 50 deletions.
9 changes: 6 additions & 3 deletions predicators/approaches/bilevel_planning_approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,12 @@ def _solve(self, task: Task, timeout: int) -> Callable[[State], Action]:
self._last_atoms_seq = atoms_seq
policy = utils.nsrt_plan_to_greedy_policy(nsrt_plan, task.goal,
self._rng)
logging.debug("Current Task Plan:")
for act in nsrt_plan:
logging.debug(act)
# logging.debug("Current Task Plan:")
# for act in nsrt_plan:
# logging.debug(act)
logging.info("Current Task Plan:")
for n in nsrt_plan:
logging.info(f"{n.name}, {n.objects}")

# Run full bilevel planning.
else:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
You are a vision system for a robot. You are provided with two images corresponding to the states before and after a particular skill is executed. You are given a list of predicates below, and you are given the values of these predicates in the image before the skill is executed. Your job is to output the values of the following predicates in the image after the skill is executed. Pay careful attention to the visual changes between the two images to figure out which predicates change and which predicates do not change. For the predicates that change, list these separately at the end of your response. Note that in some scenes, there might be no changes. First, output a description of what changes you expect to happen based on the skill that was just run, explicitly noting the skill that was run. Second, output a description of what visual changes you see happen between the before and after images, looking specifically at the objects involved in the skill's arguments, noting what objects these are. Next, output each predicate value in the after image as a bulleted list with each predicate and value on a different line. For each predicate value, provide an explanation as to why you labelled this predicate as having this particular value. Use the format: <predicate>: <truth_value>. <explanation>.
You are a vision system for a robot. You are provided with two images corresponding to the states before and after a particular skill is executed. You are given a list of predicates below, and you are given the values of these predicates in the image before the skill is executed. Your job is to output the values of the following predicates in the image after the skill is executed. Pay careful attention to the visual changes between the two images to figure out which predicates change and which predicates do not change. Note that some or all of the predicates don't necessarily have to change. First, output a description of what changes you expect to happen based on the skill that was just run, explicitly noting the skill that was run. Second, output a description of what visual changes you see happen between the before and after images, looking specifically at the objects involved in the skill's arguments, noting what objects these are. Next, output each predicate value in the after image as a bulleted list (use '*' for the bullets) with each predicate and value on a different line. Ensure there is a period ('.') after the truth value of the predicate. For each predicate value, provide an explanation as to why you labelled this predicate as having this particular value. Use the format: `* <predicate>: <truth_value>. <explanation>`. When labeling the value of a predicate, if you don't see the objects involved in that predicate, retain its truth value from the previous timestep.

The current task plan involves the following actions in sequence:
Pick1, (robot:robot, dustpan:dustpan)
PlaceNextTo, (robot:robot, dustpan:dustpan, wrappers:wrappers)
Pick2, (robot:robot, broom:broom)
Sweep, (robot:robot, broom:broom, wrappers:wrappers, dustpan:dustpan)
PlaceOnFloor, (robot:robot, broom:broom)
Pick1, (robot:robot, dustpan:dustpan)

Your response should have three sections. Here is an outline of what your response should look like:
[START OUTLINE]
# Expected changes based on the executed skill
[insert your analysis on the expected changes you will see based on the skill that was executed]

# Visual changes observed between the images
[insert your analysis on the visual changes observed between the images]

# Predicate values in the after image
[insert your bulleted list of `* <predicate>: <truth value>. <explanation>`]
[END OUTLINE]

Predicates:
35 changes: 18 additions & 17 deletions predicators/envs/spot_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from predicators.spot_utils.spot_localization import SpotLocalizer
from predicators.spot_utils.utils import _base_object_type, _container_type, \
_immovable_object_type, _movable_object_type, _robot_type, \
_broom_type, _dustpan_type, _wrappers_type, \
construct_state_given_pbrspot, get_allowed_map_regions, \
get_graph_nav_dir, get_robot_gripper_open_percentage, get_spot_home_pose, \
load_spot_metadata, object_to_top_down_geom, update_pbrspot_given_state, \
Expand Down Expand Up @@ -1517,11 +1518,11 @@ def _get_vlm_query_str(pred_name: str, objects: Sequence[Object]) -> str:
# long = "Is the dustpan oriented such that a single sweeping motion with a broom would move the mess into the dustpan?"
# long = "Touching(dustpan, mess)"
_Touching = utils.create_vlm_predicate(
"Touching", [_movable_object_type, _movable_object_type],
lambda o: _get_vlm_query_str("Touching(dustpan, mess)", o))
"Touching", [_dustpan_type, _wrappers_type],
lambda o: _get_vlm_query_str("Touching", o))
_Inside = utils.create_vlm_predicate(
"Inside", [_movable_object_type, _movable_object_type],
lambda o: _get_vlm_query_str("Inside(mess, dustpan)", o))
"Inside", [_wrappers_type, _dustpan_type],
lambda o: _get_vlm_query_str("Inside", o))

_ALL_PREDICATES = {
_NEq, _On, _TopAbove, _NotInsideAnyContainer, _FitsInXY,
Expand Down Expand Up @@ -2487,8 +2488,8 @@ def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]:
# Object("chair", _movable_object_type),
# Object("bowl", _movable_object_type),
# Object("table", _movable_object_type),
Object("wrappers", _movable_object_type),
Object("dustpan", _movable_object_type),
Object("wrappers", _wrappers_type),
Object("dustpan", _dustpan_type),
}
for o in objects:
detection_id = LanguageObjectDetectionID(o.name)
Expand Down Expand Up @@ -2536,7 +2537,7 @@ def _create_operators(self) -> Iterator[STRIPSOperator]:
##########################################
# Pick(robot, dustpan)
robot = Variable("?robot", _robot_type)
dustpan = Variable("?dustpan", _movable_object_type)
dustpan = Variable("?dustpan", _dustpan_type)
parameters = [robot, dustpan]
preconds: Set[LiftedAtom] = {
LiftedAtom(_HandEmpty, [robot]),
Expand All @@ -2553,8 +2554,8 @@ def _create_operators(self) -> Iterator[STRIPSOperator]:

# Place(robot, dustpan, mess)
robot = Variable("?robot", _robot_type)
dustpan = Variable("?dustpan", _movable_object_type)
mess = Variable("?mess", _movable_object_type)
dustpan = Variable("?dustpan", _dustpan_type)
mess = Variable("?mess", _wrappers_type)
parameters = [robot, dustpan, mess]
preconds: Set[LiftedAtom] = {LiftedAtom(_Holding, [robot, dustpan])}
add_effs: Set[LiftedAtom] = {
Expand All @@ -2564,12 +2565,12 @@ def _create_operators(self) -> Iterator[STRIPSOperator]:
}
del_effs: Set[LiftedAtom] = {LiftedAtom(_Holding, [robot, dustpan])}
ignore_effs: Set[LiftedAtom] = set()
yield STRIPSOperator("Place1", parameters, preconds, add_effs, del_effs,
yield STRIPSOperator("PlaceNextTo", parameters, preconds, add_effs, del_effs,
ignore_effs)

# Pick(robot, broom)
robot = Variable("?robot", _robot_type)
broom = Variable("?broom", _movable_object_type)
broom = Variable("?broom", _broom_type)
parameters = [robot, broom]
preconds: Set[LiftedAtom] = {
LiftedAtom(_HandEmpty, [robot]),
Expand All @@ -2586,9 +2587,9 @@ def _create_operators(self) -> Iterator[STRIPSOperator]:

# Sweep(robot, broom, mess, dustpan)
robot = Variable("?robot", _robot_type)
broom = Variable("?broom", _movable_object_type)
mess = Variable("?mess", _movable_object_type)
dustpan = Variable("?dustpan", _movable_object_type)
broom = Variable("?broom", _broom_type)
mess = Variable("?mess", _wrappers_type)
dustpan = Variable("?dustpan", _dustpan_type)
parameters = [robot, broom, mess, dustpan]
preconds: Set[LiftedAtom] = {
LiftedAtom(_Holding, [robot, broom]),
Expand All @@ -2603,7 +2604,7 @@ def _create_operators(self) -> Iterator[STRIPSOperator]:

# Place(robot, broom)
robot = Variable("?robot", _robot_type)
broom = Variable("?broom", _movable_object_type)
broom = Variable("?broom", _broom_type)
parameters = [robot, broom]
preconds: Set[LiftedAtom] = {LiftedAtom(_Holding, [robot, broom])}
add_effs: Set[LiftedAtom] = {
Expand All @@ -2612,7 +2613,7 @@ def _create_operators(self) -> Iterator[STRIPSOperator]:
}
del_effs: Set[LiftedAtom] = {LiftedAtom(_Holding, [robot, broom])}
ignore_effs: Set[LiftedAtom] = set()
yield STRIPSOperator("Place2", parameters, preconds, add_effs, del_effs,
yield STRIPSOperator("PlaceOnFloor", parameters, preconds, add_effs, del_effs,
ignore_effs)

# def _generate_train_tasks(self) -> List[EnvironmentTask]:
Expand Down Expand Up @@ -2645,7 +2646,7 @@ def __init__(self, use_gui: bool = True) -> None:
# Create constant objects.
self._spot_object = Object("robot", _robot_type)
op_to_name = {o.name: o for o in self._create_operators()}
op_names_to_keep = {"Pick1", "Place1", "Pick2", "Sweep", "Place2"}
op_names_to_keep = {"Pick1", "PlaceNextTo", "Pick2", "Sweep", "PlaceOnFloor"}
self._strips_operators = {op_to_name[o] for o in op_names_to_keep}
self._train_tasks = []
self._test_tasks = []
Expand Down
4 changes: 2 additions & 2 deletions predicators/ground_truth_models/spot_env/nsrts.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,10 +322,10 @@ def get_nsrts(env_name: str, types: Dict[str, Type],
"DropNotPlaceableObject": utils.null_sampler,
"MoveToReadySweep": utils.null_sampler,
"Pick1": utils.null_sampler,
"Place1": utils.null_sampler,
"PlaceNextTo": utils.null_sampler,
"Pick2": utils.null_sampler,
"Sweep": utils.null_sampler,
"Place2": utils.null_sampler
"PlaceOnFloor": utils.null_sampler
}

# If we're doing proper bilevel planning with a simulator, then
Expand Down
8 changes: 4 additions & 4 deletions predicators/ground_truth_models/spot_env/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,10 +958,10 @@ def _teleop(robot: Robot, lease_client: LeaseClient):
"DropNotPlaceableObject": Box(0, 1, (0, )), # empty
"MoveToReadySweep": Box(0, 1, (0, )), # empty
"Pick1": Box(0, 1, (0, )), # empty
"Place1": Box(0, 1, (0, )), # empty
"PlaceNextTo": Box(0, 1, (0, )), # empty
"Pick2": Box(0, 1, (0, )), # empty
"Sweep": Box(0, 1, (0, )), # empty
"Place2": Box(0, 1, (0, )) # empty
"PlaceOnFloor": Box(0, 1, (0, )) # empty
}

# NOTE: the policies MUST be unique because they output actions with extra info
Expand All @@ -986,10 +986,10 @@ def _teleop(robot: Robot, lease_client: LeaseClient):
"DropNotPlaceableObject": _drop_not_placeable_object_policy,
"MoveToReadySweep": _move_to_ready_sweep_policy,
"Pick1": _teleop_policy,
"Place1": _teleop_policy,
"PlaceNextTo": _teleop_policy,
"Pick2": _teleop_policy,
"Sweep": _teleop_policy,
"Place2": _teleop_policy
"PlaceOnFloor": _teleop_policy
}


Expand Down
30 changes: 26 additions & 4 deletions predicators/perception/spot_perceiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
detect_objects, visualize_all_artifacts
from predicators.spot_utils.utils import _container_type, \
_immovable_object_type, _movable_object_type, _robot_type, \
_broom_type, _dustpan_type, _wrappers_type, \
get_allowed_map_regions, load_spot_metadata, object_to_top_down_geom
from predicators.structs import Action, DefaultState, EnvironmentTask, \
GoalDescription, GroundAtom, Object, Observation, Predicate, \
Expand Down Expand Up @@ -659,8 +660,8 @@ def _create_goal(self, state: State,
return goal
if goal_description == "put the mess in the dustpan":
robot = Object("robot", _robot_type)
dustpan = Object("dustpan", _movable_object_type)
wrappers = Object("wrappers", _movable_object_type)
dustpan = Object("dustpan", _dustpan_type)
wrappers = Object("wrappers", _wrappers_type)
goal = {
GroundAtom(Inside, [wrappers, dustpan]),
GroundAtom(Holding, [robot, dustpan])
Expand Down Expand Up @@ -809,8 +810,9 @@ def _create_state(self) -> State:
# table = Object("table", _movable_object_type)
# cup = Object("cup", _movable_object_type)
# pan = Object("pan", _container_type)
wrappers = Object("wrappers", _movable_object_type)
dustpan = Object("dustpan", _movable_object_type)
wrappers = Object("wrappers", _wrappers_type)
dustpan = Object("dustpan", _dustpan_type)
broom = Object("broom", _broom_type)
# bread = Object("bread", _movable_object_type)
# toaster = Object("toaster", _immovable_object_type)
# microwave = Object("microwave", _movable_object_type)
Expand Down Expand Up @@ -1000,6 +1002,26 @@ def _create_state(self) -> State:
"in_hand_view": 0,
"in_view": 1,
"is_sweeper": 0
},
broom: {
"x": 0,
"y": 0,
"z": 0,
"qw": 0,
"qx": 0,
"qy": 0,
"qz": 0,
"shape": 0,
"height": 0,
"width" : 0,
"length": 0,
"object_id": 2,
"placeable": 1,
"held": 0,
"lost": 0,
"in_hand_view": 0,
"in_view": 1,
"is_sweeper": 0
}
}
state_dict = {k: list(v.values()) for k, v in state_dict.items()}
Expand Down
9 changes: 9 additions & 0 deletions predicators/spot_utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,15 @@ class _Spot3DShape(Enum):
_container_type = Type("container",
list(_movable_object_type.feature_names),
parent=_movable_object_type)
_dustpan_type = Type("dustpan",
list(_movable_object_type.feature_names),
parent=_movable_object_type)
_broom_type = Type("broom",
list(_movable_object_type.feature_names),
parent=_movable_object_type)
_wrappers_type = Type("wrappers",
list(_movable_object_type.feature_names),
parent=_movable_object_type)


def get_collision_geoms_for_nav(state: State) -> List[_Geom2D]:
Expand Down
53 changes: 34 additions & 19 deletions predicators/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1489,6 +1489,7 @@ def _option_policy(state: State) -> _Option:
raise OptionExecutionFailure(
"Executing the NSRT failed to achieve the necessary atoms.")
cur_nsrt = nsrt_queue.pop(0)
logging.info(f"Running nsrt: {cur_nsrt.name}, {cur_nsrt.objects}")
cur_option = cur_nsrt.sample_option(state, goal, rng)
logging.debug(f"Using option {cur_option.name}{cur_option.objects} "
"from NSRT plan.")
Expand Down Expand Up @@ -2511,13 +2512,13 @@ def get_prompt_for_vlm_state_labelling(
"/predicators/datasets/vlm_input_data_prompts/atom_labelling/"
try:
with open(filepath_prefix +
CFG.grammar_search_vlm_atom_label_prompt_type + ".txt",
prompt_type + ".txt",
"r",
encoding="utf-8") as f:
prompt = f.read()
except FileNotFoundError:
raise ValueError("Unknown VLM prompting option " +
f"{CFG.grammar_search_vlm_atom_label_prompt_type}")
f"{prompt_type}")
# The prompt ends with a section for 'Predicates', so list these.
for atom_str in atoms_list:
prompt += f"\n{atom_str}"
Expand All @@ -2537,7 +2538,8 @@ def get_prompt_for_vlm_state_labelling(
# and beyond.
curr_prompt = prompt[:]
curr_prompt_imgs = [
imgs_timestep for imgs_timestep in imgs_history[-1]
imgs_history[-2][0],
imgs_history[-1][0]
]
if CFG.vlm_include_cropped_images:
if CFG.env in ["burger", "burger_no_move"]: # pragma: no cover
Expand All @@ -2553,12 +2555,13 @@ def get_prompt_for_vlm_state_labelling(
curr_prompt += "\n\nPredicate values in the first scene, " \
"before the skill was executed: \n"
curr_prompt += label_history[-1]
# import pdb; pdb.set_trace()
return (curr_prompt, curr_prompt_imgs)
else:
# NOTE: we rip out only the first image from each trajectory
# which is fine for most domains, but will be problematic for
# situations in which there is more than one image per state.
return (prompt, [imgs_history[-1][0]])
return (prompt, imgs_history[-1])


def query_vlm_for_atom_vals(
Expand Down Expand Up @@ -2606,7 +2609,6 @@ def query_vlm_for_atom_vals(
vlm_query_str, imgs = get_prompt_for_vlm_state_labelling(
CFG.vlm_test_time_atom_label_prompt_type, atom_queries_list,
label_history, images_history, [], skill_history)
import pdb; pdb.set_trace()
if vlm is None:
vlm = create_vlm_by_name(CFG.vlm_model_name) # pragma: no cover.
vlm_input_imgs = \
Expand All @@ -2618,20 +2620,33 @@ def query_vlm_for_atom_vals(
num_completions=1)
assert len(vlm_output) == 1
vlm_output_str = vlm_output[0]
print(f"VLM output: {vlm_output_str}")
all_vlm_responses = vlm_output_str.strip().split("\n")
# NOTE: this assumption is likely too brittle; if this is breaking, feel
# free to remove/adjust this and change the below parsing loop accordingly!
assert len(atom_queries_list) == len(all_vlm_responses)
for i, (atom_query, curr_vlm_output_line) in enumerate(
zip(atom_queries_list, all_vlm_responses)):
assert atom_query + ":" in curr_vlm_output_line
assert "." in curr_vlm_output_line
period_idx = curr_vlm_output_line.find(".")
# value = curr_vlm_output_line[len(atom_query + ":"):period_idx].lower().strip()
value = curr_vlm_output_line.split(': ')[-1].strip('.').lower()
if value == "true":
true_atoms.add(vlm_atoms[i])
print(f"VLM output: \n{vlm_output_str}")

# ALTERNATIVE WAY TO PARSE
if len(label_history) > 0:
truth_values = re.findall(r'\* (.*): (True|False)', vlm_output_str)
import pdb; pdb.set_trace()
for i, (atom_query, pred_label) in enumerate(zip(atom_queries_list, truth_values)):
pred, label = pred_label
assert pred in atom_query
label = label.lower()
if label == "true":
true_atoms.add(vlm_atoms[i])
else:
# import pdb; pdb.set_trace()
all_vlm_responses = vlm_output_str.strip().split("\n")
# NOTE: this assumption is likely too brittle; if this is breaking, feel
# free to remove/adjust this and change the below parsing loop accordingly!
assert len(atom_queries_list) == len(all_vlm_responses)
for i, (atom_query, curr_vlm_output_line) in enumerate(
zip(atom_queries_list, all_vlm_responses)):
assert atom_query + ":" in curr_vlm_output_line
assert "." in curr_vlm_output_line
period_idx = curr_vlm_output_line.find(".")
# value = curr_vlm_output_line[len(atom_query + ":"):period_idx].lower().strip()
value = curr_vlm_output_line.split(': ')[-1].strip('.').lower()
if value == "true":
true_atoms.add(vlm_atoms[i])

# breakpoint()
# Add the text of the VLM's response to the state, to be used in the future!
Expand Down

0 comments on commit 29242a8

Please sign in to comment.