Implement pick and place tasks + some improvements and fixes #289

Open
wants to merge 73 commits into base: master

Commits (73)
0824ae7
try to visualize sokoban - may need a neater way
lf-zhao Apr 23, 2024
bf4000d
add assert for fast downward planner in running once func
lf-zhao Apr 23, 2024
19d9d53
try to use `rich` package for more structured console output!
lf-zhao Apr 23, 2024
feaa264
upload a naive way to store images
lf-zhao Apr 25, 2024
dd1fbf6
debug
lf-zhao Apr 25, 2024
32a06a3
upload - manual copy from Nishanth's VLM interface in LIS predicators…
lf-zhao Apr 25, 2024
ee3f634
add OpenAI vlm - in progress
lf-zhao Apr 25, 2024
a1a67fd
update config setting for using vlm
lf-zhao Apr 25, 2024
5d9f12e
add package
lf-zhao Apr 26, 2024
cf3dbf7
another missed one
lf-zhao Apr 26, 2024
3001f32
manually add Nishanth's new pretrained model interface for now, see L…
lf-zhao Apr 29, 2024
4d47211
add new OpenAI VLM class, add example to use
lf-zhao Apr 29, 2024
e45a0e9
add a flag for caching
lf-zhao Apr 29, 2024
8786dcd
now the example working - fix requesting vision messages, update test
lf-zhao Apr 29, 2024
71d0b3e
update; add choosing img detail quality
lf-zhao Apr 29, 2024
112b690
include the test image I used for now, not sure what I should do with…
lf-zhao Apr 29, 2024
f884df7
remove original vlm interface, already merged into latest pretrained …
lf-zhao Apr 29, 2024
8c17c42
Merge branch 'refs/heads/master' into lis-spot/implement-vlm-predicat…
lf-zhao Apr 30, 2024
aec70de
found a way to use VLM to evaluate; add current images and also visib…
lf-zhao Apr 30, 2024
94e6a4c
found a way to use VLM to evaluate; check if visible in current scene…
lf-zhao Apr 30, 2024
3ee2ba9
update State struct; adding to Spot specific subclass doesn't work, n…
lf-zhao Apr 30, 2024
1abf488
add detail option
lf-zhao May 1, 2024
1c82c44
working; implement On predicate with VLM classifier pipeline! add cal…
lf-zhao May 1, 2024
eeb1583
make a separate function for vlm predicate classifier evaluation
lf-zhao May 1, 2024
acbdb0a
add test
lf-zhao May 3, 2024
01498f6
update example, move to test, move img
lf-zhao May 3, 2024
0774354
remove
lf-zhao May 3, 2024
68ef57d
format
lf-zhao May 4, 2024
6560a94
update
lf-zhao May 4, 2024
1596f68
batch VLM classifier working on Spot!! add field to State, add VLMPre…
lf-zhao May 8, 2024
98ae2a9
batch VLM classifier eval: add vlm predicates fields to observation
lf-zhao May 8, 2024
2683a6b
batch VLM classifier eval: function on batch query and parse
lf-zhao May 8, 2024
b0b7568
batch VLM classifier eval: provide VLM predicates to object finding, …
lf-zhao May 8, 2024
b4718f7
batch VLM classifier eval: add VLM predicate fields to state+obs, bui…
lf-zhao May 8, 2024
3ca9032
formatting
lf-zhao May 8, 2024
1ab3d36
remove some comments
lf-zhao May 8, 2024
11f91ff
remove some comments
lf-zhao May 8, 2024
c98e2fe
fix, add tenacity
lf-zhao May 8, 2024
4973f1f
fix structs
lf-zhao May 8, 2024
26239e8
more fix
lf-zhao May 8, 2024
e78e342
update
lf-zhao May 9, 2024
ce43a76
some clean
lf-zhao May 9, 2024
93fb574
fix no VLM case
lf-zhao May 9, 2024
cb0e1ee
add predicate prompt; fix and clean
lf-zhao May 9, 2024
63a1429
add predicate prompt & some logging; fix and clean
lf-zhao May 9, 2024
ba2575f
overwrite vlm predicate classifier; reformat
lf-zhao May 9, 2024
a11e5a1
update
lf-zhao May 9, 2024
8ba4c5d
update vlm query in obj finding
lf-zhao May 9, 2024
372c191
add a simple pick place task - pick block and place into bowl
lf-zhao May 9, 2024
3019b97
update
lf-zhao May 9, 2024
f3e6a45
update obj metadata
lf-zhao May 9, 2024
7c82fde
minor
lf-zhao May 10, 2024
499f639
?? change width/radius again, now okay
lf-zhao May 10, 2024
51bf569
add prompt to vlm predicate
lf-zhao May 10, 2024
86ef996
minor
lf-zhao May 10, 2024
6db3fbe
minor
lf-zhao May 10, 2024
27f2698
Merge branch 'refs/heads/lis-spot/update-vlm-predicate-pipeline' into…
lf-zhao May 10, 2024
d6daa39
update VLM atom update rule; add prompt to VLM predicate
lf-zhao May 10, 2024
7a4d20e
add prompt field to VLM predicate / atom
lf-zhao May 10, 2024
ff770ce
reformat
lf-zhao May 10, 2024
0175868
fix hash error - override hash func again
lf-zhao May 10, 2024
083ac9f
update atom print logic
lf-zhao May 10, 2024
9bd7949
update print for predicate eval
lf-zhao May 10, 2024
ad1f754
got fix: pick and place working! add curr obs atoms to last state, r…
lf-zhao May 10, 2024
0f0e8ef
minor
lf-zhao May 10, 2024
5004878
try update container dx value
lf-zhao May 10, 2024
8e0c4e0
minor
lf-zhao May 10, 2024
83484fb
rich table print util
lf-zhao May 11, 2024
5898a1f
add table; to fix for object find!
lf-zhao May 11, 2024
5ebd449
add print rich table for VLM atoms
lf-zhao May 11, 2024
dc6ae7b
formatting
lf-zhao May 11, 2024
0e0d6cf
clean up previous legacy version of individual VLM classifier
lf-zhao May 18, 2024
13f21ce
minor formatting
lf-zhao May 19, 2024
1 change: 1 addition & 0 deletions predicators/args.py
@@ -43,6 +43,7 @@ def create_arg_parser(env_required: bool = True,
     parser.add_argument("--experiment_id", default="", type=str)
     parser.add_argument("--load_experiment_id", default="", type=str)
     parser.add_argument("--log_file", default="", type=str)
+    parser.add_argument("--log_rich", default="true", type=str)
     parser.add_argument("--use_gui", action="store_true")
     parser.add_argument('--debug',
                         action="store_const",
272 changes: 241 additions & 31 deletions predicators/envs/spot_env.py

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions predicators/ground_truth_models/spot_env/nsrts.py
@@ -285,10 +285,16 @@ class SpotEnvsGroundTruthNSRTFactory(GroundTruthNSRTFactory):
     @classmethod
     def get_env_names(cls) -> Set[str]:
         return {
-            "spot_cube_env", "spot_soda_floor_env", "spot_soda_table_env",
-            "spot_soda_bucket_env", "spot_soda_chair_env",
-            "spot_main_sweep_env", "spot_ball_and_cup_sticky_table_env",
-            "spot_brush_shelf_env", "lis_spot_block_floor_env"
+            "spot_cube_env",
+            "spot_soda_floor_env",
+            "spot_soda_table_env",
+            "spot_soda_bucket_env",
+            "spot_soda_chair_env",
+            "spot_main_sweep_env",
+            "spot_ball_and_cup_sticky_table_env",
+            "spot_brush_shelf_env",
+            "lis_spot_block_floor_env",
+            "lis_spot_block_bowl_env",
         }

     @staticmethod
1 change: 1 addition & 0 deletions predicators/ground_truth_models/spot_env/options.py
@@ -996,6 +996,7 @@ def get_env_names(cls) -> Set[str]:
             "spot_ball_and_cup_sticky_table_env",
             "spot_brush_shelf_env",
             "lis_spot_block_floor_env",
+            "lis_spot_block_bowl_env",
         }

     @classmethod
10 changes: 8 additions & 2 deletions predicators/main.py
@@ -71,8 +71,14 @@ def main() -> None:
     args = utils.parse_args()
     utils.update_config(args)
     str_args = " ".join(sys.argv)
-    # Log to stderr.
-    handlers: List[logging.Handler] = [logging.StreamHandler()]
+    # Log to stderr, or use the `rich` package for more structured output.
+    handlers: List[logging.Handler] = []
+    if CFG.log_rich:
+        from rich.logging import RichHandler
+        handlers.append(RichHandler())
+    else:
+        handlers.append(logging.StreamHandler())
+
     if CFG.log_file:
         handlers.append(logging.FileHandler(CFG.log_file, mode='w'))
     logging.basicConfig(level=CFG.loglevel,
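Note on the new flag: `--log_rich` is parsed with `type=str` and defaults to "true", so any non-empty value (including "false") currently enables the rich handler. For reference, a minimal standalone sketch of the handler selection above, with a plain boolean standing in for `CFG.log_rich` (the log level and format string are illustrative):

    import logging

    from rich.logging import RichHandler

    use_rich = True  # stands in for CFG.log_rich
    handler: logging.Handler = RichHandler() if use_rich \
        else logging.StreamHandler()
    logging.basicConfig(level=logging.INFO,
                        format="%(message)s",
                        handlers=[handler])
    logging.info("structured console output via rich")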
33 changes: 31 additions & 2 deletions predicators/perception/spot_perceiver.py
@@ -200,6 +200,18 @@ def _update_state_from_observation(self, observation: Observation) -> None:
         for obj in observation.objects_in_view:
             self._lost_objects.discard(obj)

+        # NOTE: This is only used when using a VLM for predicate evaluation.
+        # NOTE: The performance impact should be considered later.
+        if CFG.spot_vlm_eval_predicate:
+            # Add the current Spot images to the state if needed.
+            self._camera_images = observation.images
+            self._vlm_atom_dict = observation.vlm_atom_dict
+            self._vlm_predicates = observation.vlm_predicates
+        else:
+            self._camera_images = None
+            self._vlm_atom_dict = None
+            self._vlm_predicates = None
+
     def _create_state(self) -> State:
         if self._waiting_for_observation:
             return DefaultState
@@ -281,9 +293,19 @@ def _create_state(self) -> State:
         # logging.info("Simulator state:")
         # logging.info(simulator_state)

+        # Prepare the current images from the observation.
+        camera_images = self._camera_images if CFG.spot_vlm_eval_predicate else None
+
         # Now finish the state.
-        state = _PartialPerceptionState(percept_state.data,
-                                        simulator_state=simulator_state)
+        state = _PartialPerceptionState(
+            percept_state.data,
+            simulator_state=simulator_state,
+            camera_images=camera_images,
+            visible_objects=self._objects_in_view,
+            vlm_atom_dict=self._vlm_atom_dict,
+            vlm_predicates=self._vlm_predicates,
+        )
+        # DEBUG: look into the dataclass field init - why the warning?

         return state
@@ -464,6 +486,13 @@ def _create_goal(self, state: State,
             block = Object("red_block", _movable_object_type)
             Holding = pred_name_to_pred["Holding"]
             return {GroundAtom(Holding, [robot, block])}
+        if goal_description == "pick the red block into the green bowl":
+            block = Object("red_block", _movable_object_type)
+            bowl = Object("green_bowl", _container_type)
+            Inside = pred_name_to_pred["Inside"]
+            return {
+                GroundAtom(Inside, [block, bowl]),
+            }
         if goal_description == "setup sweeping":
             robot = Object("robot", _robot_type)
             brush = Object("brush", _movable_object_type)
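For context, a hedged sketch of how the new state fields might be consumed downstream; the `perceiver.step` call and the dict layout of `camera_images` are assumptions for illustration, not part of this diff:

    import logging

    from predicators.settings import CFG

    # Hypothetical consumer of the new _PartialPerceptionState fields.
    state = perceiver.step(observation)  # assumed perceiver API
    if CFG.spot_vlm_eval_predicate:
        assert state.camera_images is not None
        for camera_name in state.camera_images:
            logging.debug(f"VLM input image from camera: {camera_name}")
        # Ground atoms already evaluated by the VLM are cached on the state.
        logging.debug(f"Cached VLM atoms: {state.vlm_atom_dict}")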
3 changes: 3 additions & 0 deletions predicators/planning.py
@@ -1215,6 +1215,9 @@ def run_task_plan_once(
         raise PlanningFailure(
             "Skeleton produced by A-star exceeds horizon!")
     elif "fd" in CFG.sesame_task_planner:  # pragma: no cover
+        # Run Fast Downward. See the instructions in the docstring of `_sesame_plan_with_fast_downward`.
+        assert "FD_EXEC_PATH" in os.environ, \
+            "Please follow the instructions in the docstring of this method!"
         fd_exec_path = os.environ["FD_EXEC_PATH"]
         exec_str = os.path.join(fd_exec_path, "fast-downward.py")
         timeout_cmd = "gtimeout" if sys.platform == "darwin" \
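The new assertion fails fast when Fast Downward is not configured. A minimal sketch of the required setup before planning with the "fd" task planner (the checkout path is a placeholder):

    import os

    # Point FD_EXEC_PATH at a local Fast Downward checkout.
    os.environ["FD_EXEC_PATH"] = "/path/to/downward"  # placeholder
    # run_task_plan_once then builds the planner command from
    # os.path.join(os.environ["FD_EXEC_PATH"], "fast-downward.py").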
114 changes: 112 additions & 2 deletions predicators/pretrained_model_interface.py
@@ -5,16 +5,20 @@
 """

 import abc
+import base64
 import logging
 import os
 import time
-from typing import List, Optional
+from io import BytesIO
+from typing import Dict, List, Optional

 import cv2
 import google
 import google.generativeai as genai
 import imagehash
 import openai
+import PIL.Image
+from tenacity import retry, stop_after_attempt, wait_random_exponential

 from predicators.settings import CFG
@@ -74,7 +78,7 @@ def sample_completions(self,
         model_id = self.get_id()
         prompt_id = hash(prompt)
         config_id = f"{temperature}_{seed}_{num_completions}_" + \
-            f"{stop_token}"
+                    f"{stop_token}"
         # If the temperature is 0, the seed does not matter.
         if temperature == 0.0:
             config_id = f"most_likely_{num_completions}_{stop_token}"
@@ -249,3 +253,109 @@ def _sample_completions(
         time.sleep(3.0)
         response.resolve()
         return [response.text]


+class OpenAIVLM(VisionLanguageModel):
+    """Interface for OpenAI's VLMs, including GPT-4 Turbo (and preview
+    versions)."""
+
+    def __init__(self, model_name: str = "gpt-4-turbo", detail: str = "auto"):
+        """Initialize with a specific model name."""
+        self.model_name = model_name
+        self.detail = detail
+        assert "OPENAI_API_KEY" in os.environ
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    def prepare_vision_messages(self,
+                                images: List[PIL.Image.Image],
+                                prefix: Optional[str] = None,
+                                suffix: Optional[str] = None,
+                                image_size: Optional[int] = 512,
+                                detail: str = "auto") -> List[Dict[str, str]]:
+        """Prepare text and image messages for the OpenAI API."""
+        content = []
+
+        if detail is None or detail == "auto":
+            detail = self.detail
+
+        if prefix:
+            content.append({"text": prefix, "type": "text"})
+
+        assert images
+        assert detail in ["auto", "low", "high"]
+        for img in images:
+            img_resized = img
+            if image_size:
+                factor = image_size / max(img.size)
+                img_resized = img.resize(
+                    (int(img.size[0] * factor), int(img.size[1] * factor)))
+
+            # Convert the image to PNG format and encode it in base64.
+            buffer = BytesIO()
+            img_resized.save(buffer, format="PNG")
+            buffer_bytes = buffer.getvalue()
+            frame = base64.b64encode(buffer_bytes).decode("utf-8")
+
+            content.append({
+                "image_url": {
+                    "url": f"data:image/png;base64,{frame}",
+                    "detail": detail  # was hardcoded to "auto"; use the validated arg
+                },
+                "type": "image_url"
+            })
+
+        if suffix:
+            content.append({"text": suffix, "type": "text"})
+
+        return [{"role": "user", "content": content}]
+
+    @retry(wait=wait_random_exponential(min=1, max=60),
+           stop=stop_after_attempt(6))
+    def call_openai_api(self,
+                        messages: list,
+                        model: str = "gpt-4",
+                        seed: Optional[int] = None,
+                        max_tokens: int = 32,
+                        temperature: float = 0.2,
+                        verbose: bool = False) -> str:
+        """Make an API call to OpenAI."""
+        client = openai.OpenAI()
+        completion = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            seed=seed,
+            max_tokens=max_tokens,
+            temperature=temperature,
+        )
+        if verbose:
+            print(f"OpenAI API response: {completion}")
+        assert len(completion.choices) == 1
+        return completion.choices[0].message.content
+
+    def get_id(self) -> str:
+        """Get an identifier for the model."""
+        return f"OpenAI-{self.model_name}"
+
+    def _sample_completions(
+        self,
+        prompt: str,
+        imgs: Optional[List[PIL.Image.Image]],
+        temperature: float,
+        seed: int,
+        stop_token: Optional[str] = None,
+        num_completions: int = 1,
+        max_tokens: int = 512,
+    ) -> List[str]:
+        """Query the model and get responses."""
+        assert imgs is not None
+        messages = self.prepare_vision_messages(prefix=prompt,
+                                                images=imgs,
+                                                detail="auto")
+        responses = [
+            self.call_openai_api(messages,
+                                 model=self.model_name,
+                                 max_tokens=max_tokens,
+                                 temperature=temperature)
+            for _ in range(num_completions)
+        ]
+        return responses
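A hedged usage sketch of the new class via the cached `sample_completions` entry point on the base class shown earlier; the image path and prompt are illustrative:

    import PIL.Image

    from predicators.pretrained_model_interface import OpenAIVLM

    # Requires OPENAI_API_KEY to be set in the environment.
    vlm = OpenAIVLM(model_name="gpt-4-turbo", detail="auto")
    img = PIL.Image.open("path/to/test_image.png")  # illustrative path
    responses = vlm.sample_completions(
        prompt="Is the red block inside the green bowl? Answer yes or no.",
        imgs=[img],
        temperature=0.2,
        seed=0,
        num_completions=1)
    print(responses[0])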
6 changes: 6 additions & 0 deletions predicators/settings.py
@@ -46,6 +46,9 @@ class GlobalSettings:
     # your call to utils.reset_config().
     render_state_dpi = 150
     approach_wrapper = None
+    # Use VLMs to evaluate some spatial predicates in visual environments,
+    # e.g., Sokoban. Still a work in progress.
+    enable_vlm_eval_predicate = False

     # cover_multistep_options env parameters
     cover_multistep_action_limits = [-np.inf, np.inf]
@@ -178,6 +181,9 @@ class GlobalSettings:
     spot_run_dry = False
     spot_use_perfect_samplers = False  # for debugging
     spot_sweep_env_goal_description = "get the objects into the bucket"
+    # Evaluate some predicates with a VLM; needs additional setup; WIP.
+    spot_vlm_eval_predicate = False
+    vlm_eval_verbose = False

     # pddl blocks env parameters
     pddl_blocks_procedural_train_min_num_blocks = 3
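A hedged sketch of enabling the new settings from a test or script through `utils.reset_config`, the override hook referenced at the top of GlobalSettings; the env name follows this PR, and the other values are illustrative:

    from predicators import utils

    utils.reset_config({
        "env": "lis_spot_block_bowl_env",
        "spot_vlm_eval_predicate": True,
        "vlm_eval_verbose": True,
    })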
17 changes: 16 additions & 1 deletion predicators/spot_utils/graph_nav_maps/b45-621/metadata.yaml
@@ -46,4 +46,19 @@ static-object-features:
     length: 0.1
     width: 0.1
     placeable: 1
-    is_sweeper: 0
+    is_sweeper: 0
+    radius: 0.1  # TODO quick fix
+  green_bowl:
+    shape: 2
+    height: 0.5
+    length: 0.5
+    width: 0.5
+    placeable: 0
+    is_sweeper: 0
+    radius: 0.2  # TODO quick fix
+
+# NOTE: Not sure what these mean, but they have to be there?
+prepare_container_relative_xy:
+  dx: -1.0
+  dy: 0.1
+  angle: -1.5707  # -pi / 2
20 changes: 11 additions & 9 deletions predicators/spot_utils/perception/object_detection.py
@@ -473,15 +473,17 @@ def get_random_mask_pixel_from_artifacts(
     mask_idx = rng.choice(len(pixels_in_mask))
     pixel_tuple = (pixels_in_mask[1][mask_idx], pixels_in_mask[0][mask_idx])
     # Uncomment to plot the grasp pixel being selected!
-    # rgb_img = artifacts["language"]["rgbds"][camera_name].rgb
-    # _, axes = plt.subplots()
-    # axes.imshow(rgb_img)
-    # axes.add_patch(
-    #     plt.Rectangle((pixel_tuple[0], pixel_tuple[1]), 5, 5, color='red'))
-    # plt.tight_layout()
-    # outdir = Path(CFG.spot_perception_outdir)
-    # plt.savefig(outdir / "grasp_pixel.png", dpi=300)
-    # plt.close()
+    """
+    rgb_img = artifacts["language"]["rgbds"][camera_name].rgb
+    _, axes = plt.subplots()
+    axes.imshow(rgb_img)
+    axes.add_patch(
+        plt.Rectangle((pixel_tuple[0], pixel_tuple[1]), 5, 5, color='red'))
+    plt.tight_layout()
+    outdir = Path(CFG.spot_perception_outdir)
+    plt.savefig(outdir / "grasp_pixel.png", dpi=300)
+    plt.close()
+    """
     return pixel_tuple

