diff --git a/hand_image_0.png b/hand_image_0.png
new file mode 100644
index 0000000000..7557ab8050
Binary files /dev/null and b/hand_image_0.png differ
diff --git a/image_0.png b/image_0.png
new file mode 100644
index 0000000000..7fd439e1da
Binary files /dev/null and b/image_0.png differ
diff --git a/image_1.png b/image_1.png
new file mode 100644
index 0000000000..bdabc74791
Binary files /dev/null and b/image_1.png differ
diff --git a/image_2.png b/image_2.png
new file mode 100644
index 0000000000..ad118a7511
Binary files /dev/null and b/image_2.png differ
diff --git a/image_3.png b/image_3.png
new file mode 100644
index 0000000000..2d6dd6a899
Binary files /dev/null and b/image_3.png differ
diff --git a/image_4.png b/image_4.png
new file mode 100644
index 0000000000..b3e3b8b761
Binary files /dev/null and b/image_4.png differ
diff --git a/image_5.png b/image_5.png
new file mode 100644
index 0000000000..9b227df31d
Binary files /dev/null and b/image_5.png differ
diff --git a/image_i.png b/image_i.png
new file mode 100644
index 0000000000..9b227df31d
Binary files /dev/null and b/image_i.png differ
diff --git a/my_image.png b/my_image.png
new file mode 100644
index 0000000000..14f3e7b4aa
Binary files /dev/null and b/my_image.png differ
diff --git a/my_image2.png b/my_image2.png
new file mode 100644
index 0000000000..6417b933cc
Binary files /dev/null and b/my_image2.png differ
diff --git a/predicators/cogman.py b/predicators/cogman.py
index 4401f60c98..f0d00e11ee 100644
--- a/predicators/cogman.py
+++ b/predicators/cogman.py
@@ -80,6 +80,8 @@ def step(self, observation: Observation) -> Optional[Action]:
             logging.info("[CogMan] Termination triggered.")
             return None
         # Check if we should replan.
+        self._exec_monitor.perceiver = self._perceiver
+        self._exec_monitor.env_task = self._current_env_task
         if self._exec_monitor.step(state):
             logging.info("[CogMan] Replanning triggered.")
             assert self._current_goal is not None
diff --git a/predicators/envs/spot_env.py b/predicators/envs/spot_env.py
index 9158651422..9938306f67 100644
--- a/predicators/envs/spot_env.py
+++ b/predicators/envs/spot_env.py
@@ -1641,22 +1641,22 @@ def _get_sweeping_surface_for_container(container: Object,
 # NOTE: How to express the belief-space 3-value predicate using state-space
 # binary predicate is tricky, because the system doesn't support NOT (negation)
 # or OR (disjunction). Thus, it couldn't do "NotKnownAsTrue OR NotKnownAsFalse".
-# E.g,. _ContainingWaterKnownAsTrue won't work.
-_ContainingWaterKnown = VLMPredicate(
-    "ContainingWaterKnown", [_container_type],
-    prompt="[Answer: yes/no only] This predicate is true (answer [yes]) if you know whether the container contains water or not. If you don't know, answer [no]."
+# E.g,. _ContainingFoodKnownAsTrue won't work.
+_ContainingFoodKnown = VLMPredicate(
+    "ContainingFoodKnown", [_container_type],
+    prompt="[Answer: yes/no only] This predicate is true (answer [yes]) if you know whether the steel bowl has anything in it or not. If you don't know, answer [no]."
 )
-_ContainingWaterUnknown = VLMPredicate(
-    "ContainingWaterUnknown", [_container_type],
-    prompt="[Answer: yes/no only] This predicate is true (answer [yes]) if you do not know whether the container contains water or not. If you know, answer [no]."
+_ContainingFoodUnknown = VLMPredicate(
+    "ContainingFoodUnknown", [_container_type],
+    prompt="[Answer: yes/no only] This predicate is true (answer [yes]) if you do not know whether the steel bowl has anything in it or not. If you know, answer [no]."
 )
-_ContainingWater = VLMPredicate(
-    "ContainingWater", [_container_type],
-    prompt="[Answer: yes/no only] This predicate is true (answer [yes]) if the container has water in it. If you know it doesn't have water, answer [no]."
+_ContainingFood = VLMPredicate(
+    "ContainingFood", [_container_type],
+    prompt="[Answer: yes/no only] This predicate is true (answer [yes]) if the steel bowl has anything in it. If you know it doesn't have anything, answer [no]."
 )
-_NotContainingWater = VLMPredicate(
-    "NotContainingWater", [_container_type],
-    prompt="[Answer: yes/no only] This predicate is true (answer [yes]) if the container does not have water in it. If it has water, answer [no]."
+_NotContainingFood = VLMPredicate(
+    "NotContainingFood", [_container_type],
+    prompt="[Answer: yes/no only] This predicate is true (answer [yes]) if the steel bowl does not have anything in it. If it has anything, answer [no]."
 )
 
 _InHandViewFromTop = VLMPredicate(
@@ -1667,10 +1667,10 @@ def _get_sweeping_surface_for_container(container: Object,
 
 _ALL_PREDICATES.update({
     _DoorOpenKnownTrue, _DoorOpenKnownFalse,
-    _ContainingWaterKnown,
-    _ContainingWaterUnknown,
-    _ContainingWater,
-    _NotContainingWater,
+    _ContainingFoodKnown,
+    _ContainingFoodUnknown,
+    _ContainingFood,
+    _NotContainingFood,
     _InHandViewFromTop  # TODO check why missing
 })
 
@@ -1745,18 +1745,18 @@ def _create_operators() -> Iterator[STRIPSOperator]:
         LiftedAtom(_InHandViewFromTop, [robot, cup]),  # TODO comment
         LiftedAtom(_HandEmpty, [robot]),
         LiftedAtom(_NotHolding, [robot, cup]),
-        LiftedAtom(_ContainingWaterUnknown, [cup]),
+        LiftedAtom(_ContainingFoodUnknown, [cup]),
     }
     # NOTE: Determinized effect: both Containing and NotContaining
    # The belief state will be updated after execution
     add_effs = {
-        LiftedAtom(_ContainingWaterKnown, [cup]),
-        LiftedAtom(_ContainingWater, [cup]),
+        LiftedAtom(_ContainingFoodKnown, [cup]),
+        LiftedAtom(_ContainingFood, [cup]),
         # TODO add not containing water
-        LiftedAtom(_NotContainingWater, [cup])
+        LiftedAtom(_NotContainingFood, [cup])
     }
     del_effs = {
-        LiftedAtom(_ContainingWaterUnknown, [cup])
+        LiftedAtom(_ContainingFoodUnknown, [cup])
     }
     # TODO check ignore effs
     ignore_effs = {_Reachable, _InHandViewFromTop, _InView, _RobotReadyForSweeping}
@@ -3472,7 +3472,8 @@ def _detection_id_to_obj(self) -> Dict[ObjectDetectionID, Object]:
         # NOTE: we view cup as container;
         cup = Object("cup", _container_type)
         # cup_detection = LanguageObjectDetectionID("green bowl/greenish bowl")
-        cup_detection = LanguageObjectDetectionID("orange cup/orange cylinder/orange-ish mug")
+        # cup_detection = LanguageObjectDetectionID("steel bowl/metal bowl/shiny-metallic bowl")
+        cup_detection = LanguageObjectDetectionID("blue cup/blue cylinder/blue-ish mug")  # TODO test
         # cup_detection = LanguageObjectDetectionID("spam box/spam container/spam-ish box")
         # cup_detection = LanguageObjectDetectionID("yellow apple/yellowish apple")
 
diff --git a/predicators/execution_monitoring/expected_atoms_monitor.py b/predicators/execution_monitoring/expected_atoms_monitor.py
index 884483b15e..246d29b3ce 100644
--- a/predicators/execution_monitoring/expected_atoms_monitor.py
+++ b/predicators/execution_monitoring/expected_atoms_monitor.py
@@ -31,6 +31,11 @@ def step(self, state: State) -> bool:
         # If the expected atoms are a subset of the current atoms, then
         # we don't have to replan.
         unsat_atoms = {a for a in next_expected_atoms if not a.holds(state)}
+        # Check goal
+        assert self.perceiver is not None and self.env_task is not None
+        goal = self.perceiver._create_goal(state, self.env_task.goal_description)
+        import ipdb; ipdb.set_trace()
+        #
         if not unsat_atoms:
             return False
         logging.info(
diff --git a/predicators/ground_truth_models/spot_env/nsrts.py b/predicators/ground_truth_models/spot_env/nsrts.py
index 982e639be5..25cb6a02b9 100644
--- a/predicators/ground_truth_models/spot_env/nsrts.py
+++ b/predicators/ground_truth_models/spot_env/nsrts.py
@@ -98,8 +98,8 @@ def _move_to_hand_view_object_from_top_sampler(state: State, goal: Set[GroundAtom],
     # Parameters are relative distance, dyaw (to the object you're moving to).
     del goal
 
-    min_dist = 1.2
-    max_dist = 1.8
+    min_dist = 0.9
+    max_dist = 1.2
 
     robot_obj = objs[0]
     obj_to_nav_to = objs[1]
diff --git a/predicators/ground_truth_models/spot_env/options.py b/predicators/ground_truth_models/spot_env/options.py
index 4aab7e5f3d..37e6428c1e 100644
--- a/predicators/ground_truth_models/spot_env/options.py
+++ b/predicators/ground_truth_models/spot_env/options.py
@@ -38,7 +38,8 @@
 from predicators.spot_utils.utils import DEFAULT_HAND_DROP_OBJECT_POSE, \
     DEFAULT_HAND_LOOK_STRAIGHT_DOWN_POSE, DEFAULT_HAND_POST_DUMP_POSE, \
     DEFAULT_HAND_PRE_DUMP_LIFT_POSE, DEFAULT_HAND_PRE_DUMP_POSE, \
-    get_relative_se2_from_se3, load_spot_metadata, object_to_top_down_geom
+    get_relative_se2_from_se3, load_spot_metadata, object_to_top_down_geom, \
+    DEFAULT_HAND_LOOK_FROM_TOP
 from predicators.structs import Action, Array, Object, ParameterizedOption, \
     Predicate, SpotActionExtraInfo, State, Type
 
@@ -531,7 +532,9 @@ def _move_to_hand_view_object_from_above_policy(state: State, memory: Dict,
     yaw_param_idx = 1
     robot_obj_idx = 0
     target_obj_idx = 1
-    do_gaze = True
+    do_gaze = False
+    robot, localizer, _ = get_robot()
+    move_hand_to_relative_pose(robot, DEFAULT_HAND_LOOK_FROM_TOP)
     return _move_to_target_policy(name, distance_param_idx, yaw_param_idx,
                                   robot_obj_idx, target_obj_idx, do_gaze,
                                   state, memory, objects, params)
diff --git a/predicators/perception/spot_perceiver.py b/predicators/perception/spot_perceiver.py
index a9e5ffc70f..db98b07ac1 100644
--- a/predicators/perception/spot_perceiver.py
+++ b/predicators/perception/spot_perceiver.py
@@ -3,7 +3,7 @@
 import logging
 import time
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set, Collection
 
 import imageio.v2 as iio
 import numpy as np
@@ -14,7 +14,7 @@
 from predicators.envs import BaseEnv, get_or_create_env
 from predicators.envs.spot_env import HANDEMPTY_GRIPPER_THRESHOLD, \
     SpotCubeEnv, SpotRearrangementEnv, _drafting_table_type, \
-    _PartialPerceptionState, _SpotObservation, in_general_view_classifier
+    _PartialPerceptionState, _SpotObservation, in_general_view_classifier, _ALL_TYPES
 from predicators.perception.base_perceiver import BasePerceiver
 from predicators.settings import CFG
 from predicators.spot_utils.utils import _container_type, \
@@ -23,6 +23,7 @@
 from predicators.structs import Action, DefaultState, EnvironmentTask, \
     GoalDescription, GroundAtom, Object, Observation, Predicate, \
     SpotActionExtraInfo, State, Task, Video
+from predicators.spot_utils.perception.object_perception import vlm
 
 
 class SpotPerceiver(BasePerceiver):
@@ -311,7 +312,6 @@ def _create_state(self) -> State:
 
     def _create_goal(self, state: State,
                      goal_description: GoalDescription) -> Set[GroundAtom]:
-        del state  # not used
         # Unfortunate hack to deal with the fact that the state is actually
         # not yet set. Hopefully one day other cleanups will enable cleaning.
         assert self._curr_env is not None
@@ -550,30 +550,34 @@ def _create_goal(self, state: State,
         if goal_description == "know container not as empty":
             # container = Object("container", _container_type)
             cup = Object("cup", _container_type)
-            ContainingWaterKnown = pred_name_to_pred["ContainingWaterKnown"]
-            ContainingWater = pred_name_to_pred["ContainingWater"]
+            ContainingFoodKnown = pred_name_to_pred["ContainingFoodKnown"]
+            ContainingFood = pred_name_to_pred["ContainingFood"]
             return {
-                GroundAtom(ContainingWaterKnown, [cup]),
-                GroundAtom(ContainingWater, [cup]),
+                GroundAtom(ContainingFoodKnown, [cup]),
+                GroundAtom(ContainingFood, [cup]),
             }
         if goal_description == "place empty cup into the box":
             cup = Object("cup", _container_type)
             plastic_bin = Object("plastic_bin", _container_type)
-            ContainingWaterKnown = pred_name_to_pred["ContainingWaterKnown"]
-            NotContainingWater = pred_name_to_pred["NotContainingWater"]
+            ContainingFoodKnown = pred_name_to_pred["ContainingFoodKnown"]
+            NotContainingFood = pred_name_to_pred["NotContainingFood"]
             Inside = pred_name_to_pred["Inside"]
-            return {
-                #GroundAtom(ContainingWaterKnown, [cup]),
-                #GroundAtom(NotContainingWater, [cup]),
-                GroundAtom(Inside, [cup, plastic_bin]),
-            }
+            if state.data == {}:
+                return {
+                    #GroundAtom(ContainingFoodKnown, [cup]),
+                    #GroundAtom(NotContainingFood, [cup]),
+                    GroundAtom(Inside, [cup, plastic_bin]),
+                }
+            object_name_to_object = {}
+            self._parse_vlm_goal_from_state(state, goal_description, object_name_to_object)
+            import ipdb; ipdb.set_trace()
         if goal_description == "know container as empty":
             cup = Object("cup", _container_type)
-            ContainingWaterKnown = pred_name_to_pred["ContainingWaterKnown"]
-            NotContainingWater = pred_name_to_pred["NotContainingWater"]
+            ContainingFoodKnown = pred_name_to_pred["ContainingFoodKnown"]
+            NotContainingFood = pred_name_to_pred["NotContainingFood"]
             return {
-                GroundAtom(ContainingWaterKnown, [cup]),
-                GroundAtom(NotContainingWater, [cup]),
+                GroundAtom(ContainingFoodKnown, [cup]),
+                GroundAtom(NotContainingFood, [cup]),
             }
         if goal_description == "put the cup into the plastic bin on floor":
             cup = Object("cup", _container_type)
@@ -695,3 +699,45 @@ def render_mental_images(self, observation: Observation,
         logging.info(f"Wrote out to {outfile}")
         plt.close()
         return [img]
+
+    def _get_language_goal_prompt_prefix(self,
+                                         object_names: Collection[str]) -> str:
+        # pylint:disable=line-too-long
+        available_predicates = ", ".join([p for p in sorted([pred.pretty_str()[1] for pred in self._curr_env.goal_predicates])])
+        available_object_types = ", ".join(sorted([t.name for t in _ALL_TYPES]))
+        # We could extract the object names, but this is simpler.
+        prompt = f"""# The available predicates are: {available_predicates}
+# The available object types are: {available_object_types}
+# Use the available predicates and object types to convert natural language goals into JSON goals.
+
+# I want a sandwich with a patty, cheese, and lettuce, and get ready to give me a glass of milk.
+{{"Holding": [["robot", "milk"]], "On": [["bread0", "board"], ["bread1", "lettuce0"], ["lettuce0", "cheese0"], ["cheese0", "patty0"], ["patty0", "bread0"]]}}
+"""
+        return prompt
+
+    def _parse_vlm_goal_from_state(
+            self, state: State, language_goal: str,
+            id_to_obj: Dict[str, Object]) -> Set[GroundAtom]:
+        """Helper for parsing language-based goals from JSON task specs."""
+        object_names = set(id_to_obj)
+        prompt_prefix = self._get_language_goal_prompt_prefix(object_names)
+        prompt = prompt_prefix + f"\n# {language_goal}"
+        import ipdb; ipdb.set_trace()
+        image_list = [
+            PIL.Image.fromarray(v.rotated_rgb) for _, v in rgbds.items()
+        ]
+        responses = vlm.sample_completions(
+            prompt=prompt,
+            imgs=image_list,
+            temperature=0.1,
+            seed=int(time.time()),
+            num_completions=1,
+        )
+        response = responses[0]
+        import ipdb; ipdb.set_trace()
+        # Currently assumes that the LLM is perfect. In the future, will need
+        # to handle various errors and perhaps query the LLM for multiple
+        # responses until we find one that can be parsed.
+        goal_spec = json.loads(response)
+        return self._curr_env._parse_goal_from_json(goal_spec, id_to_obj)
+
diff --git a/predicators/spot_utils/perception/object_perception.py b/predicators/spot_utils/perception/object_perception.py
index 448aa03b5b..fc9f4e1fe1 100644
--- a/predicators/spot_utils/perception/object_perception.py
+++ b/predicators/spot_utils/perception/object_perception.py
@@ -216,6 +216,7 @@ def query_vlm(full_prompt, image_list):
         votes = [result[i] for result in results]
         final_results.append(votes.count(True) > votes.count(False))
 
+    import ipdb; ipdb.set_trace()
     return final_results
 
 
diff --git a/predicators/spot_utils/utils.py b/predicators/spot_utils/utils.py
index dfa9381700..940da95178 100644
--- a/predicators/spot_utils/utils.py
+++ b/predicators/spot_utils/utils.py
@@ -47,6 +47,11 @@
 DEFAULT_HAND_POST_DUMP_POSE = math_helpers.SE3Pose(
     x=0.80, y=0.0, z=0.25, rot=math_helpers.Quat.from_pitch(np.pi / 2))
 DEFAULT_SIM_ROBOT_Z_OFFSET = 0.6
+DEFAULT_HAND_LOOK_FROM_TOP = math_helpers.SE3Pose(x=0.80,
+                                                  y=0.0,
+                                                  z=0.75,
+                                                  rot=math_helpers.Quat.from_pitch(
+                                                      5*np.pi / 12))
 
 
 # Spot-specific types.
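
Note on the ContainingFood* predicates introduced in spot_env.py above: together they encode a single three-valued belief (unknown / known to contain / known to be empty) using only positive binary predicates, since, as the in-line NOTE says, the planner supports neither negation nor disjunction. A minimal standalone sketch of that mapping, for illustration only; the enum and function names below are assumptions, not code from this patch:

from enum import Enum, auto
from typing import Set


class ContainmentBelief(Enum):
    """Three-valued belief about whether the container holds anything."""
    UNKNOWN = auto()   # neither outcome is known yet
    CONTAINS = auto()  # known to contain something
    EMPTY = auto()     # known to be empty


def belief_to_predicate_names(belief: ContainmentBelief) -> Set[str]:
    """Map one belief value onto the four binary predicates the planner sees."""
    if belief is ContainmentBelief.UNKNOWN:
        return {"ContainingFoodUnknown"}
    if belief is ContainmentBelief.CONTAINS:
        return {"ContainingFoodKnown", "ContainingFood"}
    return {"ContainingFoodKnown", "NotContainingFood"}

The determinized observe operator in the same hunk optimistically adds both ContainingFood and NotContainingFood together with ContainingFoodKnown; per its NOTE, the belief state is reconciled with what the camera actually saw after execution.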
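
The new _parse_vlm_goal_from_state helper in spot_perceiver.py notes that it currently assumes the LLM response is perfect and will later need error handling. A small standalone sketch of the defensive JSON-parsing step that comment anticipates; parse_goal_response and the code-fence stripping are illustrative assumptions, not repository code:

import json
from typing import Dict, List


def parse_goal_response(response: str) -> Dict[str, List[List[str]]]:
    """Parse a VLM completion like the sandwich example in the prompt prefix
    into a dict mapping predicate names to lists of argument-name lists."""
    # Some models wrap JSON in a markdown code fence; strip it before parsing.
    cleaned = response.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`")
        if cleaned.startswith("json"):
            cleaned = cleaned[len("json"):]
    try:
        goal_spec = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        raise ValueError(f"VLM goal response was not valid JSON: {exc}") from exc
    if not isinstance(goal_spec, dict):
        raise ValueError("Expected a JSON object of predicate names to argument lists.")
    return goal_spec


# Example, using the sample completion shown in the prompt prefix:
example = '{"Holding": [["robot", "milk"]], "On": [["bread0", "board"]]}'
assert parse_goal_response(example)["Holding"] == [["robot", "milk"]]

A parser like this could slot in where the helper currently calls json.loads(response) directly, retrying or re-querying the VLM when ValueError is raised.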