From ea8c5927d9b92419e46e7954a8a975da59816e83 Mon Sep 17 00:00:00 2001
From: Naoki Yokoyama
Date: Tue, 12 Sep 2023 21:35:30 -0400
Subject: [PATCH] making yolo loading configurable (disabled for real world),
 setting vqa to be false by default, using position control for the Spot base
 instead of explicit Euler integration of velocities, which tracked poorly in
 practice

---
 zsos/policy/base_objectnav_policy.py |  7 ++--
 zsos/policy/reality_policies.py      |  5 +++
 zsos/reality/pointnav_env.py         | 51 +++++++++++++++++++++-------
 3 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/zsos/policy/base_objectnav_policy.py b/zsos/policy/base_objectnav_policy.py
index 279aaef..bd42e89 100644
--- a/zsos/policy/base_objectnav_policy.py
+++ b/zsos/policy/base_objectnav_policy.py
@@ -36,6 +36,7 @@ class BaseObjectNavPolicy(BasePolicy):
     _stop_action: Tensor = None  # MUST BE SET BY SUBCLASS
     _observations_cache: Dict[str, Any] = {}
     _non_coco_caption = ""
+    _load_yolo: bool = True
 
     def __init__(
         self,
@@ -50,7 +51,7 @@ def __init__(
         agent_radius: float = 0.18,
         obstacle_map_area_threshold: float = 1.5,
         hole_area_thresh: int = 100000,
-        use_vqa: bool = True,
+        use_vqa: bool = False,
         vqa_prompt: str = "Is this ",
         coco_threshold: float = 0.6,
         non_coco_threshold: float = 0.4,
@@ -229,7 +230,7 @@ def _get_policy_info(self, detections: ObjectDetections) -> Dict[str, Any]:
 
     def _get_object_detections(self, img: np.ndarray) -> ObjectDetections:
         target_classes = self._target_object.split("|")
-        has_coco = any(c in COCO_CLASSES for c in target_classes)
+        has_coco = any(c in COCO_CLASSES for c in target_classes) and self._load_yolo
         has_non_coco = any(c not in COCO_CLASSES for c in target_classes)
 
         detections = (
@@ -406,7 +407,7 @@ class ZSOSConfig:
     min_obstacle_height: float = 0.61
     max_obstacle_height: float = 0.88
     hole_area_thresh: int = 100000
-    use_vqa: bool = True
+    use_vqa: bool = False
     vqa_prompt: str = "Is this "
     coco_threshold: float = 0.6
     non_coco_threshold: float = 0.4
diff --git a/zsos/policy/reality_policies.py b/zsos/policy/reality_policies.py
index 92ea854..4038104 100644
--- a/zsos/policy/reality_policies.py
+++ b/zsos/policy/reality_policies.py
@@ -21,6 +21,11 @@ class RealityMixin:
     """
 
     _stop_action: Tensor = torch.tensor([[0.0, 0.0]], dtype=torch.float32)
+    _load_yolo: bool = False
+    _non_coco_caption: str = (
+        "chair . table . tv . laptop . microwave . toaster . sink . refrigerator . book"
+        " . clock . vase . scissors . teddy bear . hair drier . toothbrush ."
+    )
 
     def __init__(self: BaseObjectNavPolicy, *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
diff --git a/zsos/reality/pointnav_env.py b/zsos/reality/pointnav_env.py
index 1fc1e07..dc2023c 100644
--- a/zsos/reality/pointnav_env.py
+++ b/zsos/reality/pointnav_env.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from zsos.reality.robots.base_robot import BaseRobot
+from zsos.reality.robots.bdsw_robot import BDSWRobot
 from zsos.reality.robots.camera_ids import SpotCamIds
 from zsos.utils.geometry_utils import convert_to_global_frame, rho_theta
 
@@ -18,8 +18,7 @@ class PointNavEnv:
 
     def __init__(
         self,
-        robot: BaseRobot,
-        # robot: BDSWRobot,
+        robot: BDSWRobot,
         max_body_cam_depth: float = 3.5,  # max depth (in meters) for body cameras
         max_lin_dist: float = 0.25,
         max_ang_dist: float = np.deg2rad(30),
@@ -33,6 +32,7 @@ def __init__(
         self._max_lin_dist = max_lin_dist
         self._max_ang_dist = max_ang_dist
         self._time_step = time_step
+        self._cmd_id = None
 
     def reset(self, goal: Any, relative=True, *args, **kwargs) -> Dict[str, np.ndarray]:
         assert isinstance(goal, np.ndarray)
@@ -46,15 +46,42 @@ def reset(self, goal: Any, relative=True, *args, **kwargs) -> Dict[str, np.ndarr
         return self._get_obs()
 
     def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]:
-        ang_vel, lin_vel = self._compute_velocities(action)
-        # self.robot.command_base_velocity(ang_vel, lin_vel)
-        time.sleep(self._time_step)
-        self.robot.command_base_velocity(0.0, 0.0)
-        done = ang_vel == 0.0 and lin_vel == 0.0
+        if self._cmd_id is not None:
+            cmd_status = 0
+            while cmd_status != 1:
+                feedback_resp = self.robot.spot.get_cmd_feedback(self._cmd_id)
+                cmd_status = (
+                    feedback_resp.feedback.synchronized_feedback
+                ).mobility_command_feedback.se2_trajectory_feedback.status
+                if cmd_status != 1:
+                    time.sleep(0.1)
+
+        ang_dist, lin_dist = self._compute_displacements(action)
+        self._cmd_id = self.robot.spot.set_base_position(
+            x_pos=lin_dist,
+            y_pos=0,
+            yaw=ang_dist,
+            end_time=100,
+            relative=True,
+            max_fwd_vel=1.0,
+            max_hor_vel=0.5,
+            max_ang_vel=np.deg2rad(60),
+            disable_obstacle_avoidance=False,
+            blocking=False,
+        )
+        done = ang_dist == 0.0 and lin_dist == 0.0
         return self._get_obs(), 0.0, done, {}  # not using info dict yet
 
     def _compute_velocities(self, action: Dict[str, np.ndarray]) -> Tuple[float, float]:
-        velocities = []
+        ang_dist, lin_dist = self._compute_displacements(action)
+        ang_vel = ang_dist / self._time_step
+        lin_vel = lin_dist / self._time_step
+        return ang_vel, lin_vel
+
+    def _compute_displacements(
+        self, action: Dict[str, np.ndarray]
+    ) -> Tuple[float, float]:
+        displacements = []
         for action_key, max_dist in (
             ["angular", self._max_ang_dist],
             ["linear", self._max_lin_dist],
@@ -62,9 +89,9 @@ def _compute_velocities(self, action: Dict[str, np.ndarray]) -> Tuple[float, flo
             act_val = action.get(action_key, 0.0)  # default to 0.0 if key not present
             dist = np.clip(act_val, -1.0, 1.0)  # clip to [-1, 1]
             dist *= max_dist  # scale to max distance
-            velocities.append(dist / self._time_step)  # convert to velocity
-        ang_vel, lin_vel = velocities
-        return ang_vel, lin_vel
+            displacements.append(dist)  # keep as displacement; no velocity conversion
+        ang_dist, lin_dist = displacements
+        return ang_dist, lin_dist
 
     def _get_obs(self) -> Dict[str, np.ndarray]:
         return {
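
A quick standalone check of the new action-to-displacement mapping (a minimal
sketch, assuming only numpy; the maxima mirror the PointNavEnv defaults above):

    import numpy as np

    # Mirrors _compute_displacements(): normalized actions are clipped to
    # [-1, 1], then scaled by the per-step maxima from PointNavEnv.__init__.
    max_ang_dist = np.deg2rad(30)  # max rotation per step (rad)
    max_lin_dist = 0.25  # max forward travel per step (m)

    action = {"angular": -2.0, "linear": 0.5}  # -2.0 is out of range, gets clipped
    ang_dist = float(np.clip(action.get("angular", 0.0), -1.0, 1.0)) * max_ang_dist
    lin_dist = float(np.clip(action.get("linear", 0.0), -1.0, 1.0)) * max_lin_dist
    assert np.isclose(ang_dist, -np.deg2rad(30))  # clipped to the max turn
    assert np.isclose(lin_dist, 0.125)  # 0.5 * 0.25 m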
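
For context, a hypothetical driver loop for the reworked env; the BDSWRobot
construction and the spot_client object are assumptions, not part of this
patch. step() now blocks at the top until the previous set_base_position
trajectory reports status 1 before issuing the next relative command, and an
all-zero action marks the episode done:

    import numpy as np

    from zsos.reality.pointnav_env import PointNavEnv
    from zsos.reality.robots.bdsw_robot import BDSWRobot

    robot = BDSWRobot(spot_client)  # assumption: wraps an authenticated Spot client
    env = PointNavEnv(robot)
    obs = env.reset(np.array([2.0, 1.0]))  # relative (x, y) goal in meters
    for action in (
        {"angular": 0.2, "linear": 1.0},  # turn slightly while moving forward
        {"angular": 0.0, "linear": 0.0},  # all zeros -> done becomes True
    ):
        obs, _, done, _ = env.step(action)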