From a8a6e85f940f2a3b45c97e99625b48461721cdb0 Mon Sep 17 00:00:00 2001
From: Katrina Ashton <146023091+kashton-bdai@users.noreply.github.com>
Date: Mon, 16 Oct 2023 12:17:54 -0400
Subject: [PATCH] Fixes for things that go wrong when running (#15)

Fix the caption not being updated when a non-default caption is passed,
and fix the check that only tested whether a value was None instead of
also testing whether it was False. Also fix a mypy pre-commit issue.
---
 vlfm/policy/base_objectnav_policy.py | 2 +-
 vlfm/vlm/grounding_dino.py           | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vlfm/policy/base_objectnav_policy.py b/vlfm/policy/base_objectnav_policy.py
index 0ac3783..abc73c8 100644
--- a/vlfm/policy/base_objectnav_policy.py
+++ b/vlfm/policy/base_objectnav_policy.py
@@ -350,7 +350,7 @@ def _update_object_map(
 
             # If we are using vqa, then use the BLIP2 model to visually confirm whether
             # the contours are actually correct.
-            if self._use_vqa is not None:
+            if (self._use_vqa is not None) and self._use_vqa:
                 contours, _ = cv2.findContours(
                     object_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
                 )
diff --git a/vlfm/vlm/grounding_dino.py b/vlfm/vlm/grounding_dino.py
index e2aa92d..25808e4 100644
--- a/vlfm/vlm/grounding_dino.py
+++ b/vlfm/vlm/grounding_dino.py
@@ -39,9 +39,7 @@ def __init__(
         self.box_threshold = box_threshold
         self.text_threshold = text_threshold
 
-    def predict(
-        self, image: np.ndarray, caption: Optional[str] = ""
-    ) -> ObjectDetections:
+    def predict(self, image: np.ndarray, caption: str = "") -> ObjectDetections:
         """
         This function makes predictions on an input image tensor or numpy array using a
         pretrained model.
@@ -62,6 +60,8 @@ def predict(
         )
         if caption == "":
             caption_to_use = self.caption
+        else:
+            caption_to_use = caption
         print("Caption:", caption_to_use)
         with torch.inference_mode():
             boxes, logits, phrases = predict(