diff --git a/.github/workflows/predicators.yml b/.github/workflows/predicators.yml
index f6e744ae6c..1739fbb188 100644
--- a/.github/workflows/predicators.yml
+++ b/.github/workflows/predicators.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: ["3.10.14"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -28,7 +28,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: ["3.10.14"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -40,8 +40,7 @@ jobs:
     - name: Install dependencies
       run: |
         pip install -e .
-        pip install -U git+https://github.com/python/mypy.git@9a10967fdaa2ac077383b9eccded42829479ef31
-      # Note: if mypy issue #5485 gets resolved, we can install from head again.
+        pip install mypy==1.8.0
     - name: Mypy
       run: |
         mypy . --config-file mypy.ini
@@ -49,7 +48,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: ["3.10.14"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -71,7 +70,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: ["3.10.14"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -95,7 +94,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: ["3.10.14"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -116,7 +115,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: ["3.10.14"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
diff --git a/.gitignore b/.gitignore
index 92d64bae65..337123a38e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,7 +19,7 @@ logs
 saved_approaches
 saved_datasets
 scripts/results
-llm_cache
+pretrained_model_cache
 machines.txt
 *_vision_data
 tests/_fake_trajs
diff --git a/README.md b/README.md
index 9df8464097..1ceb61c512 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Methods for predicate learning are implemented as Approaches (e.g., `predicators
 A simple implementation of search-then-sample bilevel planning is provided in `predicators/planning.py`. This implementation uses the "SeSamE" strategy: SEarch-and-SAMple planning, then Execution.
 
 ## Installation
-* This repository uses Python versions 3.8+.
+* This repository uses Python versions 3.10-3.11. We recommend 3.10.14.
 * Run `pip install -e .` to install dependencies.
 
 ## Instructions For Running Code
diff --git a/mypy.ini b/mypy.ini
index 724bcd439f..a262ef4ed4 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -2,7 +2,7 @@
 strict_equality = True
 disallow_untyped_calls = True
 warn_unreachable = True
-exclude = (predicators/envs/assets|predicators/third_party|venv)
+exclude = (predicators/envs/assets|venv)
 
 [mypy-predicators.*]
 disallow_untyped_defs = True
@@ -15,6 +15,7 @@ ignore_missing_imports = True
 
 [mypy-predicators.third_party.*]
 ignore_missing_imports = True
+ignore_errors = True
 
 [mypy-setuptools.*]
 ignore_missing_imports = True
@@ -127,3 +128,12 @@ ignore_missing_imports = True
 
 [mypy-pbrspot.*]
 ignore_missing_imports = True
+
+[mypy-ImageHash.*]
+ignore_missing_imports = True
+
+[mypy-google.*]
+ignore_missing_imports = True
+
+[mypy-google.generativeai.*]
+ignore_missing_imports = True
diff --git a/predicators/approaches/active_sampler_learning_approach.py b/predicators/approaches/active_sampler_learning_approach.py
index e18cf52135..3209c908d1 100644
--- a/predicators/approaches/active_sampler_learning_approach.py
+++ b/predicators/approaches/active_sampler_learning_approach.py
@@ -737,13 +737,13 @@ def _wrap_object_specific_samplers(
     base_sampler: NSRTSampler,
 ) -> NSRTSampler:
 
-    def _wrapped_sampler(state: State, goal: Set[GroundAtom],
-                         rng: np.random.Generator,
-                         objects: Sequence[Object]) -> Array:
+    def _wrapped_sampler(
+            state: State, goal: Set[GroundAtom], rng: np.random.Generator,
+            objects: Sequence[Object]) -> Array:  # pragma: no cover
         objects_tuple = tuple(objects)
         # If we haven't yet learned a object-specific sampler for these objects
         # then use the base sampler.
-        if objects_tuple not in object_specific_samplers:  # pragma: no cover
+        if objects_tuple not in object_specific_samplers:
             return base_sampler(state, goal, rng, objects)
         sampler = object_specific_samplers[objects_tuple]
         return sampler(state, goal, rng, objects)
diff --git a/predicators/approaches/documentation/grammar_search_invention_approach.md b/predicators/approaches/documentation/grammar_search_invention_approach.md
new file mode 100644
index 0000000000..2919dfa218
--- /dev/null
+++ b/predicators/approaches/documentation/grammar_search_invention_approach.md
@@ -0,0 +1,122 @@
+# Grammar Search Invention Approach
+This approach is primarily useful for inventing predicates via program synthesis from demonstrations, as described originally in:
+[Predicate Invention for Bilevel Planning](https://arxiv.org/abs/2203.09634). Silver*, Chitnis*, Kumar, McClinton, Lozano-Perez, Kaelbling, Tenenbaum. AAAI 2023.
+
+An example command for running the approach from that paper is:
+```
+python predicators/main.py --env cover --approach grammar_search_invention --excluded_predicates all --num_train_tasks 50
+```
+
+Last updated: 04/28/2024
+
+## Inventing predicates by leveraging a VLM
+We can leverage a VLM to propose concepts that form the basis of the grammar used for predicate invention. This has two advantages: (1) invented predicates operate directly on images, (2) the names of predicates correspond to common-sense concepts.
+
+To do this, we need to supply demonstrations in the form of a sequence of images and labelled options corresponding to the `_Option` that the robot used to get between subsequent states corresponding to subsequent images. 
+
+### Creating datasets for VLM predicate invention
+Demonstrations should be saved as a subfolder in the `saved_datasets` folder. The folder should be named `<env_name>__vlm_demos__<seed>__<num_demos>`. For instance, `apple_coring__vlm_demos__456__1`.
+Within the folder, there should be 1 subfolder for every demonstration trajectory. So in the above example, there should be exactly 1 subfolder. Name each of these subfolders `traj_<demonstration_number>` with 0-indexing (e.g., `traj_0` for the first demo).
+Within each traj subfolder, there should be two things:
+1. a subfolder corresponding to each timestep for the demonstration. This will contain all the images (potentially from multiple camera views) that correspond to the observation at the current timestep. 
+2. an `options_traj.txt` file that lists out the series of options executed between each of the states.
+
+The `options_traj.txt` file should contain strings corresponding to the options executed as part of the trajectory. The format for each option should be `<option_name>(<objects>, [<continuous_params>])`.
+An example file might look like:
+```
+pick(apple, [])
+place_on(apple, plate, [])
+pick(slicing_tool, [])
+slice(slicing_tool, apple, hand, [])
+```
+
+Given this, a sample folder structure for a demonstration might look like:
+apple_coring__vlm_demos__456__2
+| traj_0
+    | 0
+        | 0.jpg
+    | 1
+        | 1.jpg
+    | 2
+        | 2.jpg
+    | 3
+        | 3.jpg
+    | 4
+        | 4.jpg
+    | 5
+        | 5.jpg
+    | options_traj.txt
+| traj_1
+    | 0
+        | 0.jpg
+    | 1
+        | 1.jpg
+    | 2
+        | 2.jpg
+    | 3
+        | 3.jpg
+    | 4
+        | 4.jpg
+    | 5
+        | 5.jpg
+    | options_traj.txt
+
+### Running predicate invention using these image demos
+To use the Gemini VLM, you need to set the `GOOGLE_API_KEY` environment variable in your terminal. You can make/get an API key [here](https://aistudio.google.com/app/apikey).
+
+Example command: `python predicators/main.py --env apple_coring --seed 456 --approach grammar_search_invention --excluded_predicates all --num_train_tasks 1 --num_test_tasks 0 --offline_data_method img_demos --vlm_trajs_folder_name apple_coring__vlm_demos__456__1`
+
+The important flags here are the `--offline_data_method img_demos` and the `--vlm_trajs_folder_name apple_coring__vlm_demos__456__1`. The latter should point to the folder housing the demonstration set of interest!
+
+Note that VLM responses are always cached, so if you run the command on a demonstration set and then rerun it, it should be much faster since it's using cached responses!
+
+Also, the code saves a human-readable txt file to the `saved_datasets` folder that contains a text representation of the GroundAtomTrajectories. You can manually inspect and even edit this file, and then rerun the rest of the predicate invention pipeline starting from this file alone (and not the original demos) as input. Here's an example command that does that:
+`python predicators/main.py --env apple_coring --seed 456 --approach grammar_search_invention --excluded_predicates all --num_train_tasks 1 --offline_data_method demo+labelled_atoms --handmade_demo_filename apple_coring__demo+labelled_atoms__manual__1.txt`
+
+where `apple_coring__demo+labelled_atoms__manual__1.txt` is the human-readable txt file.
+
+### Structure of human-readable txt files
+We assume the txt files have a particular structure that we leverage for parsing. To explain these components, consider this below example:
+
+```
+===
+{*Holding(spoon): True.
+*Submerged(teabag): False.
+*Submerged(spoon): False.} ->
+
+pick(teabag, hand)[] -> 
+
+{*Holding(spoon): True.
+*Submerged(teabag): False.
+*Submerged(spoon): False.} ->
+
+place_in(teabag, cup)[] -> 
+
+{*Holding(spoon): True.
+*Submerged(teabag): False.
+*Submerged(spoon): False.} ->
+
+pick(spoon, hand)[] -> 
+
+{*Holding(spoon): True.
+*Submerged(teabag): False.
+*Submerged(spoon): False.} ->
+
+place_in(spoon, cup)[] -> 
+
+{*Holding(spoon): True.
+*Submerged(teabag): False.
+*Submerged(spoon): False.}
+===
+```
+
+**Components**
+- Separator: '===' is used to separate one trajectory from another (so a trajectory is sandwiched between two lines that have only '===' on them). In the above example, there is exactly one demonstration trajectory.
+- State: Each state is a bulleted list of atoms enclosed in curly braces `{}`. In the above example, there are 5 states. Note importantly that the format of every atom should be `*<predicate_name>(<obj1_name>, <obj2_name>, ...): <True|False>.`. The `*` at the start and the period `.` at the end are very important.
+- Skill: Each skill is sandwiched between two states and takes the format: `<skill_name>(<obj1_name>, <obj2_name>, ...)[<continuous_param_vector>]`. In the above example, there are 4 skills. Note that after every state except the last, there is a `->` arrow, followed by a newline, then a skill followed by another `->` arrow and a newline. This is also critical to parsing. Note also that the above example doesn't feature any continuous parameters.
+
+
+### Future features to be added
+* Enable pipeline to consider demonstrations that have low-level object-oriented state, as well as image observations.
+* Enable invented VLM predicates to actually be used and run at test-time.
+* Consider different VLMs.
\ No newline at end of file
diff --git a/predicators/approaches/grammar_search_invention_approach.py b/predicators/approaches/grammar_search_invention_approach.py
index 2bca2dbf08..c4fbf5b967 100644
--- a/predicators/approaches/grammar_search_invention_approach.py
+++ b/predicators/approaches/grammar_search_invention_approach.py
@@ -9,8 +9,8 @@
 from dataclasses import dataclass, field
 from functools import cached_property
 from operator import le
-from typing import Any, Callable, Dict, FrozenSet, Iterator, List, Sequence, \
-    Set, Tuple
+from typing import Any, Callable, Dict, FrozenSet, Iterator, List, Optional, \
+    Sequence, Set, Tuple
 
 import numpy as np
 from gym.spaces import Box
@@ -24,8 +24,8 @@
 from predicators.predicate_search_score_functions import \
     _PredicateSearchScoreFunction, create_score_function
 from predicators.settings import CFG
-from predicators.structs import Dataset, GroundAtomTrajectory, Object, \
-    ParameterizedOption, Predicate, Segment, State, Task, Type
+from predicators.structs import Dataset, GroundAtom, GroundAtomTrajectory, \
+    Object, ParameterizedOption, Predicate, Segment, State, Task, Type
 
 ################################################################################
 #                          Programmatic classifiers                            #
@@ -42,6 +42,13 @@ def _create_grammar(dataset: Dataset,
         diff_grammar = _FeatureDiffInequalitiesPredicateGrammar(dataset)
         grammar = _ChainPredicateGrammar([grammar, diff_grammar],
                                          alternate=True)
+    if CFG.grammar_search_grammar_use_euclidean_dist:
+        for (t1_f1, t1_f2, t2_f1,
+             t2_f2) in CFG.grammar_search_euclidean_feature_names:
+            euclidean_dist_grammar = _EuclideanDistancePredicateGrammar(
+                dataset, t1_f1, t2_f1, t1_f2, t2_f2)
+            grammar = _ChainPredicateGrammar([grammar, euclidean_dist_grammar],
+                                             alternate=True)
     # We next optionally add in the given predicates because we want to allow
     # negated and quantified versions of the given predicates, in
     # addition to negated and quantified versions of new predicates.
@@ -64,7 +71,10 @@ def _create_grammar(dataset: Dataset,
     # because if any predicates are equivalent to the given predicates,
     # we would not want to generate them. Don't do this if we're using
     # DebugGrammar, because we don't want to prune things that are in there.
-    if not CFG.grammar_search_use_handcoded_debug_grammar:
+    # Also don't do this if we explicitly don't want to prune such
+    # predicates.
+    if not CFG.grammar_search_use_handcoded_debug_grammar and \
+        CFG.grammar_search_prune_redundant_preds:
         grammar = _PrunedGrammar(dataset, grammar)
     # We don't actually need to enumerate the given predicates
     # because we already have them in the initial predicate set,
@@ -213,6 +223,57 @@ def pretty_str(self) -> Tuple[str, str]:
         return vars_str, body_str
 
 
+@dataclass(frozen=True, eq=False, repr=False)
+class _EuclideanAttributeDiffCompareClassifier(_BinaryClassifier):
+    """Compare the squared euclidean distance between two objects' feature
+    pairs with a constant value."""
+    object1_index: int
+    object1_type: Type
+    obj1_attr1_name: str
+    obj1_attr2_name: str
+    object2_index: int
+    object2_type: Type
+    obj2_attr1_name: str
+    obj2_attr2_name: str
+    constant: float
+    constant_idx: int
+    compare: Callable[[float, float], bool]
+    compare_str: str
+
+    def _classify_object(self, s: State, obj1: Object, obj2: Object) -> bool:
+        assert obj1.type == self.object1_type
+        assert obj2.type == self.object2_type
+        return self.compare((s.get(obj1, self.obj1_attr1_name) -
+                             s.get(obj2, self.obj2_attr1_name))**2 +
+                            (s.get(obj1, self.obj1_attr2_name) -
+                             s.get(obj2, self.obj2_attr2_name))**2,
+                            self.constant)
+
+    def __str__(self) -> str:
+        return (f"((({self.object1_index}:{self.object1_type.name})."
+                f"{self.obj1_attr1_name} - ({self.object2_index}:"
+                f"{self.object2_type.name}).{self.obj2_attr1_name})^2"
+                f" + (({self.object1_index}:{self.object1_type.name})."
+                f"{self.obj1_attr2_name} - ({self.object2_index}:"
+                f"{self.object2_type.name}).{self.obj2_attr2_name})^2)"
+                f"{self.compare_str}[idx {self.constant_idx}]"
+                f"{self.constant:.3})")
+
+    def pretty_str(self) -> Tuple[str, str]:
+        name1 = CFG.grammar_search_classifier_pretty_str_names[
+            self.object1_index]
+        name2 = CFG.grammar_search_classifier_pretty_str_names[
+            self.object2_index]
+        vars_str = (f"{name1}:{self.object1_type.name}, "
+                    f"{name2}:{self.object2_type.name}")
+        body_str = (f"(({name1}.{self.obj1_attr1_name} - "
+                    f"{name2}.{self.obj2_attr1_name})^2 "
+                    f" + (({name1}.{self.obj1_attr2_name} - "
+                    f"{name2}.{self.obj2_attr2_name})^2 "
+                    f"{self.compare_str} {self.constant:.3})")
+        return vars_str, body_str
+
+
 @dataclass(frozen=True, eq=False, repr=False)
 class _NegationClassifier(_ProgrammaticClassifier):
     """Negate a given classifier."""
@@ -385,6 +446,21 @@ def enumerate(self) -> Iterator[Tuple[Predicate, float]]:
     "repeated_nextto_single_option": [
         "(|(0:dot).x - (1:robot).x|<=[idx 7]6.25)",  # NextTo
     ],
+    "stick_button_move": [
+        # NOTE: changing the demonstration data slightly causes
+        # the value of the constant to change. Need to uncomment these
+        # as necessary.
+        # StickAboveButton
+        "(((0:button).x - (1:stick).tip_x)^2 + ((0:button).y - " + \
+            "(1:stick).tip_y)^2)<=[idx 0]0.18)",
+        # RobotAboveButton
+        "(((0:button).x - (1:robot).x)^2 + ((0:button).y - " + \
+            "(1:robot).y)^2)<=[idx 0]0.194)",
+        "((0:stick).held<=[idx 0]0.5)",  # Handempty
+        "NOT-((0:stick).held<=[idx 0]0.5)",  # Grasped
+        "((0:button).y<=[idx 0]3.01)",  # ButtonReachableByRobot
+        "NOT-((0:button).y<=[idx 0]3.01)",  # ButtonNotReachableByRobot
+    ],
     "unittest": [
         "((0:robot).hand<=[idx 0]0.65)", "((0:block).grasp<=[idx 0]0.0)",
         "NOT-Forall[0:block].[((0:block).width<=[idx 0]0.085)(0)]"
@@ -513,56 +589,150 @@ class _FeatureDiffInequalitiesPredicateGrammar(
         _SingleFeatureInequalitiesPredicateGrammar):
     """Generates features of the form "|0.feature - 1.feature| <= c"."""
 
+    def _yield_pred_given_const(
+            self, feature_ranges: Dict[Type, Dict[str, Tuple[float, float]]],
+            constant_idx: int, constant: float,
+            cost: float) -> Iterator[Tuple[Predicate, float]]:
+        for (t1, t2) in itertools.combinations_with_replacement(
+                sorted(self.types), 2):
+            for f1 in t1.feature_names:
+                for f2 in t2.feature_names:
+                    # To create our classifier, we need to leverage the
+                    # upper and lower bounds of its features.
+                    # First, we extract these and move on if these
+                    # bounds are relatively close together.
+                    lb1, ub1 = feature_ranges[t1][f1]
+                    if abs(lb1 - ub1) < 1e-6:
+                        continue
+                    lb2, ub2 = feature_ranges[t2][f2]
+                    if abs(lb2 - ub2) < 1e-6:
+                        continue
+                    lb, ub = utils.compute_abs_range_given_two_ranges(
+                        lb1, ub1, lb2, ub2)
+                    # Scale the constant by the correct range.
+                    k = constant * (ub - lb) + lb
+                    # Create classifier.
+                    comp, comp_str = le, "<="
+                    diff_classifier = _AttributeDiffCompareClassifier(
+                        0, t1, f1, 1, t2, f2, k, constant_idx, comp, comp_str)
+                    name = str(diff_classifier)
+                    types = [t1, t2]
+                    pred = Predicate(name, types, diff_classifier)
+                    assert pred.arity == 2
+                    yield (pred, cost)
+
     def enumerate(self) -> Iterator[Tuple[Predicate, float]]:
         # Get ranges of feature values from data.
         feature_ranges = self._get_feature_ranges()
         # Edge case: if there are no features at all, return immediately.
         if not any(r for r in feature_ranges.values()):
             return
+        # Start by generating predicates such that the two features are
+        # very close together. The reason we can't just set the constant
+        # to 1e-6 is because objects have some amount of "size", and so even
+        # when they're touching, it's not like their centers overlap.
+        # E.g. in stick button, when the robot touches the button, the center
+        # of the robot and the object might still be offset by a bit.
+        for ret_val in self._yield_pred_given_const(
+                feature_ranges, 0,
+                CFG.grammar_search_diff_features_const_multiplier, 4.0):
+            yield ret_val
+        # 0.5, 0.25, 0.75, 0.125, 0.375, ...
+        constant_generator = _halving_constant_generator(0.0, 1.0)
+        for constant_idx, (constant, cost) in enumerate(constant_generator):
+            for ret_val in self._yield_pred_given_const(
+                    feature_ranges, constant_idx, constant, cost):
+                yield ret_val
+
+
+@dataclass(frozen=True, eq=False, repr=False)
+class _EuclideanDistancePredicateGrammar(
+        _SingleFeatureInequalitiesPredicateGrammar):
+    """Generates predicates of the form "|0.x - 1.x|^2 + |0.y - 1.y|^2 <= c^2".
+    Importantly, this only operates over types that have features
+    named f1_name and f2_name.
+    """
+    t1_f1_name: str
+    t2_f1_name: str
+    t1_f2_name: str
+    t2_f2_name: str
+
+    def _compute_xy_bounds(self, feature_ranges: Dict[Type,
+                                                      Dict[str, Tuple[float,
+                                                                      float]]],
+                           t1: Type, t2: Type) -> Tuple[float, float]:
+        # To create our classifier, we need to leverage the
+        # upper and lower bounds of its x, y features.
+        lbx1, ubx1 = feature_ranges[t1][self.t1_f1_name]
+        lbx2, ubx2 = feature_ranges[t2][self.t2_f1_name]
+        lby1, uby1 = feature_ranges[t1][self.t1_f2_name]
+        lby2, uby2 = feature_ranges[t2][self.t2_f2_name]
+        # Compute the upper and lower bounds of each feature range.
+        lbx, ubx = utils.compute_abs_range_given_two_ranges(
+            lbx1, ubx1, lbx2, ubx2)
+        lby, uby = utils.compute_abs_range_given_two_ranges(
+            lby1, uby1, lby2, uby2)
+        # Now, use these to compute the upper and lower bounds of
+        # the squared expression of interest.
+        lb = lbx**2 + lby**2
+        ub = ubx**2 + uby**2
+        return (lb, ub)
+
+    def _generate_pred_given_constant(self, constant_idx: int, constant: float,
+                                      t1: Type, t2: Type, t1_f1_name: str,
+                                      t1_f2_name: str, t2_f1_name: str,
+                                      t2_f2_name: str) -> Predicate:
+        # Create classifier.
+        comp, comp_str = le, "<="
+        diff_classifier = _EuclideanAttributeDiffCompareClassifier(
+            0, t1, t1_f1_name, t1_f2_name, 1, t2, t2_f1_name, t2_f2_name,
+            constant, constant_idx, comp, comp_str)
+        name = str(diff_classifier)
+        types = [t1, t2]
+        pred = Predicate(name, types, diff_classifier)
+        return pred
+
+    def enumerate(self) -> Iterator[Tuple[Predicate, float]]:
+        # Get ranges of feature values from data.
+        feature_ranges = self._get_feature_ranges()
+        # Edge case: if there are no features at all, return immediately.
+        if not any(r for r in feature_ranges.values()):
+            return
+
+        # Start by generating predicates with a very small constant,
+        # to indicate that the objects are touching/overlapped.
+        for (t1, t2) in itertools.combinations_with_replacement(
+                sorted(self.types), 2):
+            if (self.t1_f1_name in t1.feature_names
+                    and self.t2_f1_name in t2.feature_names
+                    and self.t1_f2_name in t1.feature_names
+                    and self.t2_f2_name in t2.feature_names):
+                lb, ub = self._compute_xy_bounds(feature_ranges, t1, t2)
+                constant = ((ub - lb) *
+                            CFG.grammar_search_euclidean_const_multiplier) + lb
+                pred = self._generate_pred_given_constant(
+                    0, constant, t1, t2, self.t1_f1_name, self.t1_f2_name,
+                    self.t2_f1_name, self.t2_f2_name)
+                assert pred.arity == 2
+                yield (pred, 3.0)  # cost = arity + cost from constant
+
         # 0.5, 0.25, 0.75, 0.125, 0.375, ...
         constant_generator = _halving_constant_generator(0.0, 1.0)
         for constant_idx, (constant, cost) in enumerate(constant_generator):
             for (t1, t2) in itertools.combinations_with_replacement(
                     sorted(self.types), 2):
-                for f1 in t1.feature_names:
-                    for f2 in t2.feature_names:
-                        # To create our classifier, we need to leverage the
-                        # upper and lower bounds of its features.
-                        # First, we extract these and move on if these
-                        # bounds are relatively close together.
-                        lb1, ub1 = feature_ranges[t1][f1]
-                        if abs(lb1 - ub1) < 1e-6:
-                            continue
-                        lb2, ub2 = feature_ranges[t2][f2]
-                        if abs(lb2 - ub2) < 1e-6:
-                            continue
-                        # Now, we must compute the upper and lower bounds of
-                        # the expression |t1.f1 - t2.f2|. If the intervals
-                        # [lb1, ub1] and [lb2, ub2] overlap, then the lower
-                        # bound of the expression is just 0. Otherwise, if
-                        # lb2 > ub1, the lower bound is |ub1 - lb2|, and if
-                        # ub2 < lb1, the lower bound is |lb1 - ub2|.
-                        if utils.f_range_intersection(lb1, ub1, lb2, ub2):
-                            lb = 0.0
-                        else:
-                            lb = min(abs(lb2 - ub1), abs(lb1 - ub2))
-                        # The upper bound for the expression can be
-                        # computed in a similar fashion.
-                        ub = max(abs(ub2 - lb1), abs(ub1 - lb2))
-
-                        # Scale the constant by the correct range.
-                        k = constant * (ub - lb) + lb
-                        # Create classifier.
-                        comp, comp_str = le, "<="
-                        diff_classifier = _AttributeDiffCompareClassifier(
-                            0, t1, f1, 1, t2, f2, k, constant_idx, comp,
-                            comp_str)
-                        name = str(diff_classifier)
-                        types = [t1, t2]
-                        pred = Predicate(name, types, diff_classifier)
-                        assert pred.arity == 2
-                        yield (pred, 2 + cost
-                               )  # cost = arity + cost from constant
+                if (self.t1_f1_name in t1.feature_names
+                        and self.t2_f1_name in t2.feature_names
+                        and self.t1_f2_name in t1.feature_names
+                        and self.t2_f2_name in t2.feature_names):
+                    lb, ub = self._compute_xy_bounds(feature_ranges, t1, t2)
+                    # Scale the constant by the correct range.
+                    k = constant * (ub - lb) + lb
+                    pred = self._generate_pred_given_constant(
+                        constant_idx + 1, k, t1, t2, self.t1_f1_name,
+                        self.t1_f2_name, self.t2_f1_name, self.t2_f2_name)
+                    assert pred.arity == 2
+                    yield (pred, 2 + cost)  # cost = arity + cost from constant
 
 
 @dataclass(frozen=True, eq=False, repr=False)
@@ -753,21 +923,85 @@ def get_name(cls) -> str:
     def _get_current_predicates(self) -> Set[Predicate]:
         return self._initial_predicates | self._learned_predicates
 
-    def learn_from_offline_dataset(self, dataset: Dataset) -> None:
+    def _generate_atom_dataset_via_grammar(
+        self, dataset: Dataset
+    ) -> Tuple[List[GroundAtomTrajectory], Dict[Predicate, float]]:
+        """Generates predicates from a grammar, and applies them to the
+        dataset."""
         # Generate a candidate set of predicates.
         logging.info("Generating candidate predicates...")
         grammar = _create_grammar(dataset, self._initial_predicates)
         candidates = grammar.generate(
             max_num=CFG.grammar_search_max_predicates)
         logging.info(f"Done: created {len(candidates)} candidates:")
+        self._metrics["grammar_size"] = len(candidates)
         for predicate, cost in candidates.items():
             logging.info(f"{predicate} {cost}")
         # Apply the candidate predicates to the data.
         logging.info("Applying predicates to data...")
-        atom_dataset = utils.create_ground_atom_dataset(
-            dataset.trajectories,
-            set(candidates) | self._initial_predicates)
+
+        # Get the template str for the dataset filename for saving
+        # a ground atom dataset.
+        dataset_fname, _ = utils.create_dataset_filename_str(True)
+        # Add a bunch of things relevant to grammar search to the
+        # dataset filename string.
+        dataset_fname = dataset_fname[:-5] + \
+            f"_{CFG.grammar_search_max_predicates}" + \
+            f"_{CFG.grammar_search_grammar_includes_givens}" + \
+            f"_{CFG.grammar_search_grammar_includes_foralls}" + \
+            f"_{CFG.grammar_search_grammar_use_diff_features}" + \
+            f"_{CFG.grammar_search_use_handcoded_debug_grammar}" + \
+            dataset_fname[-5:]
+
+        # Load pre-saved data if the CFG.load_atoms flag is set.
+        atom_dataset: Optional[List[GroundAtomTrajectory]] = None
+        if CFG.load_atoms:
+            atom_dataset = utils.load_ground_atom_dataset(
+                dataset_fname, dataset.trajectories)
+        else:
+            atom_dataset = utils.create_ground_atom_dataset(
+                dataset.trajectories,
+                set(candidates) | self._initial_predicates)
+            # Save this atoms dataset if the save_atoms flag is set.
+            if CFG.save_atoms:
+                utils.save_ground_atom_dataset(atom_dataset, dataset_fname)
         logging.info("Done.")
+        assert atom_dataset is not None
+        return (atom_dataset, candidates)
+
+    def _parse_atom_dataset_from_annotated_dataset(
+        self, dataset: Dataset
+    ) -> Tuple[List[GroundAtomTrajectory], Dict[Predicate, float]]:
+        """Uses a dataset with annotations to create a candidate predicate set
+        and atoms trajectories."""
+        # We rely on the annotations as our ground atom datasets.
+        assert dataset.annotations is not None
+        # We now turn these into GroundAtomTrajectories.
+        atom_dataset = []
+        for traj, atoms in zip(dataset.trajectories, dataset.annotations):
+            atom_dataset.append((traj, atoms))
+        # Also generate the grammar by ripping out all the Predicates
+        # associated with each of the atoms in our sets.
+        candidates = {}
+        for ano_traj in dataset.annotations:
+            for ground_atom_state in ano_traj:
+                for ground_atom in ground_atom_state:
+                    assert isinstance(ground_atom, GroundAtom)
+                    if ground_atom.predicate not in candidates:
+                        # The cost of this predicate is simply its arity.
+                        candidates[ground_atom.predicate] = float(
+                            len(ground_atom.objects))
+        logging.debug(f"All candidate predicates: {candidates.keys()}")
+        return (atom_dataset, candidates)
+
+    def learn_from_offline_dataset(self, dataset: Dataset) -> None:
+        if not CFG.offline_data_method == "demo+labelled_atoms":
+            atom_dataset, candidates = self._generate_atom_dataset_via_grammar(
+                dataset)
+        else:
+            # In this case, we're inventing over already-labelled atoms.
+            atom_dataset, candidates = \
+                self._parse_atom_dataset_from_annotated_dataset(dataset)
         # Select a subset of the candidates to keep.
         logging.info("Selecting a subset...")
         if CFG.grammar_search_pred_selection_approach == "score_optimization":
diff --git a/predicators/approaches/llm_open_loop_approach.py b/predicators/approaches/llm_open_loop_approach.py
index 18687b5b47..d438420e67 100644
--- a/predicators/approaches/llm_open_loop_approach.py
+++ b/predicators/approaches/llm_open_loop_approach.py
@@ -38,8 +38,8 @@
 from predicators.approaches import ApproachFailure
 from predicators.approaches.nsrt_metacontroller_approach import \
     NSRTMetacontrollerApproach
-from predicators.llm_interface import OpenAILLM
 from predicators.planning import task_plan_with_option_plan_constraint
+from predicators.pretrained_model_interface import OpenAILLM
 from predicators.settings import CFG
 from predicators.structs import Box, Dataset, GroundAtom, Object, \
     ParameterizedOption, Predicate, State, Task, Type, _GroundNSRT, _Option
@@ -98,6 +98,7 @@ def _get_llm_based_option_plans(
         # Query the LLM.
         llm_predictions = self._llm.sample_completions(
             prompt=prompt,
+            imgs=None,
             temperature=CFG.llm_temperature,
             seed=CFG.seed,
             num_completions=CFG.llm_num_completions)
diff --git a/predicators/approaches/nsrt_learning_approach.py b/predicators/approaches/nsrt_learning_approach.py
index 0b203cbca6..5594a2ff60 100644
--- a/predicators/approaches/nsrt_learning_approach.py
+++ b/predicators/approaches/nsrt_learning_approach.py
@@ -5,7 +5,6 @@
 """
 
 import logging
-import os
 import time
 from typing import Any, Dict, List, Optional, Set
 
@@ -68,42 +67,13 @@ def _learn_nsrts(self, trajectories: List[LowLevelTrajectory],
         # options take many steps, this makes a big time/space difference.
         ground_atom_dataset: Optional[List[GroundAtomTrajectory]] = None
         if CFG.load_atoms:
-            os.makedirs(CFG.data_dir, exist_ok=True)
-            # Check that the dataset file was previously saved.
-            if os.path.exists(dataset_fname):
-                # Load the ground atoms dataset.
-                with open(dataset_fname, "rb") as f:
-                    ground_atom_dataset_atoms = pkl.load(f)
-                assert len(trajectories) == len(ground_atom_dataset_atoms)
-                logging.info("\n\nLOADED GROUND ATOM DATASET")
-
-                # The saved ground atom dataset consists only of sequences
-                # of sets of GroundAtoms, we need to recombine this with
-                # the LowLevelTrajectories to create a GroundAtomTrajectory.
-                ground_atom_dataset = []
-                for i, traj in enumerate(trajectories):
-                    ground_atom_seq = ground_atom_dataset_atoms[i]
-                    ground_atom_dataset.append(
-                        (traj, [set(atoms) for atoms in ground_atom_seq]))
-            else:
-                raise ValueError(f"Cannot load ground atoms: {dataset_fname}")
+            ground_atom_dataset = utils.load_ground_atom_dataset(
+                dataset_fname, trajectories)
         elif CFG.save_atoms:
             # Apply predicates to data, producing a dataset of abstract states.
             ground_atom_dataset = utils.create_ground_atom_dataset(
                 trajectories, self._get_current_predicates())
-            # Save ground atoms dataset to file. Note that a
-            # GroundAtomTrajectory contains a normal LowLevelTrajectory and a
-            # list of sets of GroundAtoms, so we only save the list of
-            # GroundAtoms (the LowLevelTrajectories are saved separately).
-            ground_atom_dataset_to_pkl = []
-            for gt_traj in ground_atom_dataset:
-                trajectory = []
-                for ground_atom_set in gt_traj[1]:
-                    trajectory.append(ground_atom_set)
-                ground_atom_dataset_to_pkl.append(trajectory)
-            with open(dataset_fname, "wb") as f:
-                pkl.dump(ground_atom_dataset_to_pkl, f)
-
+            utils.save_ground_atom_dataset(ground_atom_dataset, dataset_fname)
         self._nsrts, self._segmented_trajs, self._seg_to_nsrt = \
             learn_nsrts_from_data(trajectories,
                                   self._train_tasks,
diff --git a/predicators/datasets/__init__.py b/predicators/datasets/__init__.py
index 7ab65520da..6a5159886d 100644
--- a/predicators/datasets/__init__.py
+++ b/predicators/datasets/__init__.py
@@ -6,6 +6,9 @@
 from predicators import utils
 from predicators.datasets.demo_only import create_demo_data
 from predicators.datasets.demo_replay import create_demo_replay_data
+from predicators.datasets.generate_atom_trajs_with_vlm import \
+    create_ground_atom_data_from_img_trajs, \
+    create_ground_atom_data_from_labelled_txt
 from predicators.datasets.ground_atom_data import create_ground_atom_data
 from predicators.envs import BaseEnv
 from predicators.settings import CFG
@@ -40,6 +43,16 @@ def create_dataset(env: BaseEnv, train_tasks: List[Task],
         n = int(CFG.teacher_dataset_num_examples)
         assert n >= 1, "Must have at least 1 example of each predicate"
         return create_ground_atom_data(env, base_dataset, excluded_preds, n)
+    if CFG.offline_data_method == "demo+labelled_atoms":
+        return create_ground_atom_data_from_labelled_txt(
+            env, train_tasks, known_options)
+    if CFG.offline_data_method == "img_demos":  # pragma: no cover.
+        # NOTE: this below method is tested separately; it's just that testing
+        # it by calling the above function is painful because a VLM is
+        # instantiated and called from inside this method, but when testing,
+        # we want to instantiate our own 'dummy' VLM.
+        return create_ground_atom_data_from_img_trajs(env, train_tasks,
+                                                      known_options)
     if CFG.offline_data_method == "empty":
         return Dataset([])
     raise NotImplementedError("Unrecognized dataset method.")
diff --git a/predicators/datasets/generate_atom_trajs_with_vlm.py b/predicators/datasets/generate_atom_trajs_with_vlm.py
new file mode 100644
index 0000000000..5831696357
--- /dev/null
+++ b/predicators/datasets/generate_atom_trajs_with_vlm.py
@@ -0,0 +1,713 @@
+"""Functions to create offline demonstration data by leveraging VLMs."""
+
+import ast
+import glob
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Set, Tuple
+
+import numpy as np
+import PIL.Image
+
+from predicators import utils
+from predicators.envs import BaseEnv
+from predicators.envs.vlm_envs import DUMMY_GOAL_OBJ_NAME, VLMPredicateEnv
+from predicators.pretrained_model_interface import GoogleGeminiVLM, \
+    VisionLanguageModel
+from predicators.settings import CFG
+from predicators.structs import Action, Dataset, GroundAtom, \
+    ImageOptionTrajectory, LowLevelTrajectory, Object, ParameterizedOption, \
+    Predicate, State, Task, _Option
+
+
+def _generate_prompt_for_atom_proposals(
+        traj: ImageOptionTrajectory, trajectory_subsample_freq: int
+) -> List[Tuple[str, List[PIL.Image.Image]]]:
+    """Prompt(s) for generating proposals for atoms. Note that this generates a
+    sequence of multiple prompts for a given trajectory that will then be sent
+    to the VLM in one single chat session.
+
+    The prompt text is read from the '<prompt type>.txt' file under the
+    'vlm_input_data_prompts/atom_proposal' folder (ValueError if missing).
+    """
+    prompt_type = CFG.grammar_search_vlm_atom_proposal_prompt_type
+    filepath_prefix = utils.get_path_to_predicators_root() + \
+        "/predicators/datasets/vlm_input_data_prompts/atom_proposal/"
+    try:
+        with open(filepath_prefix + prompt_type + ".txt",
+                  "r",
+                  encoding="utf-8") as f:
+            prompt = f.read()
+    except FileNotFoundError as err:
+        # Chain the original error so the missing path stays visible.
+        raise ValueError("Unknown VLM prompting option " +
+                         f"{prompt_type}") from err
+    # Leave the dummy goal object out of the object list shown to the VLM.
+    prompt = prompt.format(objs=[
+        str(obj.name) for obj in sorted(traj.objects)
+        if obj.name != 'dummy_goal_obj'
+    ])
+    ret_list = []
+    if prompt_type == "naive_each_step":
+        # A step below 1 would never advance through the images.
+        if trajectory_subsample_freq < 1:
+            raise ValueError("trajectory_subsample_freq must be at least 1")
+        for i in range(0, len(traj.imgs), trajectory_subsample_freq):
+            ret_list.append((prompt, traj.imgs[i]))
+    elif prompt_type == "naive_whole_traj":
+        # NOTE: we rip out just one img from each of the state images.
+        # This is fine/works for the case where we only have one
+        # camera view, but probably will need to be amended in the future!
+        ret_list.append(
+            (prompt, [traj.imgs[i][0] for i in range(len(traj.imgs))]))
+    elif prompt_type == "options_labels_whole_traj":
+        prompt += "\n".join(act.name + str(sorted(act.objects))
+                            for act in traj.actions)
+        # NOTE: exact same issue as described in the above note for
+        # naive_whole_traj.
+        ret_list.append(
+            (prompt, [traj.imgs[i][0] for i in range(len(traj.imgs))]))
+
+    return ret_list
+
+
+def _generate_prompt_for_scene_labelling(
+        traj: ImageOptionTrajectory,
+        atoms_list: List[str]) -> List[Tuple[str, List[PIL.Image.Image]]]:
+    """Build one atom-labelling query for every timestep of traj.
+
+    All queries share the same text: the template read from the
+    'vlm_input_data_prompts/atom_labelling' folder (the file is chosen by
+    CFG.grammar_search_vlm_atom_label_prompt_type), followed by each string
+    of atoms_list on a line of its own. Only the images change between
+    timesteps.
+
+    A ValueError is raised when no template file exists for the configured
+    prompt type.
+    """
+    # Full path of the prompt template for the configured prompt type.
+    template_path = utils.get_path_to_predicators_root() + \
+        "/predicators/datasets/vlm_input_data_prompts/atom_labelling/" + \
+        CFG.grammar_search_vlm_atom_label_prompt_type + ".txt"
+    try:
+        with open(template_path, "r", encoding="utf-8") as f:
+            template_text = f.read()
+    except FileNotFoundError:
+        raise ValueError("Unknown VLM prompting option " +
+                         f"{CFG.grammar_search_vlm_atom_label_prompt_type}")
+    # Append every atom to be labelled, each preceded by a newline.
+    labelling_text = template_text + "".join(f"\n{atom_str}"
+                                             for atom_str in atoms_list)
+    # NOTE: only the first image of each state is used. This is fine/works
+    # for the case where we only have one camera view, but probably will
+    # need to be amended in the future!
+    return [(labelling_text, [state_imgs[0]]) for state_imgs in traj.imgs]
+
+
+def _sample_vlm_atom_proposals_from_trajectories(
+        trajectories: List[ImageOptionTrajectory],
+        vlm: VisionLanguageModel,
+        trajectory_subsample_freq: int = 1) -> List[List[str]]:
+    """Query a VLM for names of ground atoms relevant to the trajectories.
+
+    Prompts are built up front, then sent one by one with temperature 0.0.
+    """
+    queries = [
+        query for traj in trajectories
+        for query in _generate_prompt_for_atom_proposals(
+            traj, trajectory_subsample_freq)
+    ]
+    num_queries = len(queries)
+    vlm_outputs = []
+    for query_idx, (txt_prompt, img_prompt) in enumerate(queries, start=1):
+        completions = vlm.sample_completions(txt_prompt,
+                                             img_prompt,
+                                             0.0,
+                                             CFG.seed,
+                                             num_completions=1)
+        vlm_outputs.append(completions)
+        logging.info("Completed (%s/%s) init atoms queries to the VLM.",
+                     query_idx, num_queries)
+    return vlm_outputs
+
+
+def _label_trajectories_with_vlm_atom_values(
+        trajectories: List[ImageOptionTrajectory], vlm: VisionLanguageModel,
+        atoms_list: List[str]) -> List[List[str]]:
+    """Ask the VLM for the truth value of every atom in atoms_list at each
+    state of each trajectory. Returns, per trajectory, the list of label
+    texts (one per state) with all backslashes removed."""
+    # Progress counters, used only for logging.
+    num_scenes_total = sum(len(traj.imgs) for traj in trajectories)
+    num_scenes_done = 0
+    all_trajs_labels = []
+    for traj in trajectories:
+        traj_labels = []
+        scene_queries = _generate_prompt_for_scene_labelling(traj, atoms_list)
+        for text_prompt, img_prompt in scene_queries:
+            # Temperature 0 is used in an attempt to get accurate labels.
+            vlm_reply = vlm.sample_completions(text_prompt,
+                                               img_prompt,
+                                               0.0,
+                                               CFG.seed,
+                                               num_completions=1)
+            assert len(vlm_reply) == 1
+            # Drop any backslashes from the reply text.
+            traj_labels.append(vlm_reply[0].replace('\\', ''))
+            num_scenes_done += 1
+            logging.info("Completed (%s/%s) label queries to VLM!",
+                         num_scenes_done, num_scenes_total)
+        all_trajs_labels.append(traj_labels)
+    return all_trajs_labels
+
+
+def _parse_unique_atom_proposals_from_list(
+        atom_strs_proposals_list: List[List[str]],
+        relevant_objects_across_demos: Set[Object]) -> Set[str]:
+    """Reduce the raw VLM proposals to a set of unique, valid atom strings.
+
+    Each element of atom_strs_proposals_list must hold exactly one text.
+    That text is processed in three steps: (1) substrings shaped like
+    'name(arg, arg)' are extracted and cleaned of unnecessary characters,
+    (2) atoms that mention a name not belonging to an object in
+    relevant_objects_across_demos are dropped, and (3) duplicates vanish
+    because the surviving atoms are collected into a set.
+    """
+    # Regex pattern to match predicates.
+    atom_match_pattern = r"\b[a-z_]+\([a-z0-9, ]+\)"
+    # Names that the arguments of an atom are allowed to use.
+    known_obj_names = {obj.name for obj in relevant_objects_across_demos}
+    unique_atom_strs = set()
+    # Counts every match, valid or not, for the summary log below.
+    num_atoms_considered = 0
+    for proposal_completions in atom_strs_proposals_list:
+        # Exactly one completion is expected for every query.
+        assert len(proposal_completions) == 1
+        # Find all matches in the text.
+        candidate_strs = re.findall(atom_match_pattern,
+                                    proposal_completions[0],
+                                    flags=re.IGNORECASE)
+        for candidate_str in candidate_strs:
+            num_atoms_considered += 1
+            # Strip characters that cannot belong to an atom string.
+            atom = re.sub(r"[^\w\s\(\),]", "", candidate_str).strip(' ')
+            # Only the first parenthesised group holds the arguments.
+            arg_groups = re.findall(r'\((.*?)\)', atom)
+            atom_is_valid = not arg_groups or all(
+                arg_name.strip() in known_obj_names
+                for arg_name in arg_groups[0].split(','))
+            if atom_is_valid:
+                unique_atom_strs.add(atom)
+            logging.debug(f"Proposed atom: {atom} is valid: {atom_is_valid}")
+    logging.info(f"VLM proposed a total of {num_atoms_considered} atoms.")
+    logging.info(f"Of these, {len(unique_atom_strs)} were valid and unique.")
+    return unique_atom_strs
+
+
+def save_labelled_trajs_as_txt(
+        env: BaseEnv, labelled_atoms_trajs: List[List[str]],
+        ground_option_trajs: List[List[_Option]]) -> None:
+    """Save a txt file with a text representation of GroundAtomTrajectories.
+
+    This serves as a human-readable intermediary output for debugging,
+    and also as a convenient restart point for the pipeline (i.e., these
+    txt files can be loaded and the rest of the pipeline run from
+    there)!
+
+    The file is written into CFG.data_dir. Each labelled_atoms_trajs[i]
+    must hold one more state string than ground_option_trajs[i] has options.
+    """
+    assert len(labelled_atoms_trajs) == len(ground_option_trajs)
+    # All trajectories are delimited between pairs of "===".
+    pieces = ["===\n"]
+    for curr_atoms_traj, curr_option_traj in zip(labelled_atoms_trajs,
+                                                 ground_option_trajs):
+        # There is one more state than there are options.
+        assert len(curr_option_traj) + 1 == len(curr_atoms_traj)
+        for atom_state_str, option in zip(curr_atoms_traj, curr_option_traj):
+            # Wrap the state in curly brackets.
+            pieces.append("{" + atom_state_str + "} ->\n\n")
+            # Joining (rather than trimming a trailing ", ") keeps the
+            # output well-formed for options that take no objects.
+            obj_names_str = ", ".join(str(o.name) for o in option.objects)
+            pieces.append(f"{option.name}({obj_names_str})" +
+                          str(option.params.tolist()) + " -> \n\n")
+        # At the end of the trajectory, we need to append the final state,
+        # and a "===" delimiter.
+        pieces.append("{" + curr_atoms_traj[-1] + "}\n===\n")
+    save_str = "".join(pieces)
+    # Finally, save this constructed string as a txt file!
+    txt_filename = f"{env.get_name()}__demo+labelled_atoms__manual__" + \
+    f"{len(labelled_atoms_trajs)}.txt"
+    filepath = os.path.join(CFG.data_dir, txt_filename)
+    with open(filepath, "w", encoding="utf-8") as f:
+        f.write(save_str)
+    logging.info(f"Human-readable labelled trajectory saved to {filepath}!")
+
+
+def _parse_structured_state_into_ground_atoms(
+    env: BaseEnv,
+    train_tasks: List[Task],
+    structured_state_trajs: List[List[Dict[str, Dict[Tuple[str, ...], bool]]]],
+) -> List[List[Set[GroundAtom]]]:
+    """Convert structured state trajectories into actual trajectories of ground
+    atoms; the DummyGoal atom is added to each trajectory's final state."""
+    # We check a number of important properties before starting.
+    # Firstly, the number of train tasks must equal the number of structured
+    # state demos we have.
+    assert len(train_tasks) == len(structured_state_trajs)
+    # Secondly, we assume there is only one goal predicate, and that it is
+    # a dummy goal predicate.
+    assert len(env.goal_predicates) == 1
+    goal_preds_list = list(env.goal_predicates)
+    goal_predicate = goal_preds_list[0]
+    assert goal_predicate.name == "DummyGoal"
+    # We also assume that the oldest ancestor of every type in env.types
+    # is a type named "object" (and that env.types is not empty).
+    obj_type = None
+    for t in env.types:
+        obj_type = t.oldest_ancestor
+        assert obj_type.name == "object"
+    assert obj_type is not None
+
+    def _stripped_classifier(
+            state: State,
+            objects: Sequence[Object]) -> bool:  # pragma: no cover.
+        raise Exception("Stripped classifier should never be called!")
+
+    pred_name_to_pred = {}
+    atoms_trajs = []
+    # Loop through all trajectories in the structured_state_trajs and convert
+    # each one to a sequence of sets of GroundAtoms.
+    for i, traj in enumerate(structured_state_trajs):
+        curr_atoms_traj = []
+        objs_for_task = set(train_tasks[i].init)
+        curr_obj_name_to_obj = {obj.name: obj for obj in objs_for_task}
+        # NOTE: We assume that there is precisely one dummy object that is
+        # used to track whether the dummy goal has been reached or not.
+        assert DUMMY_GOAL_OBJ_NAME in curr_obj_name_to_obj
+        # Create a goal atom for this demonstration using the goal predicate.
+        goal_atom = GroundAtom(goal_predicate,
+                               [curr_obj_name_to_obj[DUMMY_GOAL_OBJ_NAME]])
+        for structured_state in traj:
+            curr_ground_atoms_state = set()
+            for pred_name, objs_and_val_dict in structured_state.items():
+                # IMPORTANT NOTE: this currently assumes that the data is such
+                # that a predicate with a certain name (e.g. "Sliced")
+                # always appears with the same number of object arguments
+                # (e.g. Sliced(apple), and never
+                # Sliced(apple, cutting_tool)). We might want to explicitly
+                # check for this in the future.
+                if pred_name not in pred_name_to_pred:
+                    if len(objs_and_val_dict.keys()) == 1:
+                        # Only one grounding of this predicate appears in
+                        # this state, so use the types of its own objects.
+                        for obj_args in objs_and_val_dict.keys():
+                            # We need to construct the types being
+                            # fed into this predicate.
+                            pred_types = []
+                            for obj_name in obj_args:
+                                curr_obj = curr_obj_name_to_obj[obj_name]
+                                pred_types.append(curr_obj.type)
+                            pred_name_to_pred[pred_name] = Predicate(
+                                pred_name, pred_types, _stripped_classifier)
+                    else:
+                        # In this case, we need to make a predicate that
+                        # takes in the generic 'object' type such that
+                        # multiple different objs could potentially be
+                        # subbed in.
+                        # Start by checking that the number of object
+                        # args are always the same
+                        num_args = 0
+                        for obj_args in objs_and_val_dict.keys():
+                            if num_args == 0:
+                                num_args = len(obj_args)
+                            else:
+                                assert num_args == len(obj_args)
+                        # Given this, add one new predicate with num_args
+                        # number of 'object' type arguments.
+                        pred_name_to_pred[pred_name] = Predicate(
+                            pred_name, [obj_type for _ in range(num_args)],
+                            _stripped_classifier)
+
+                # Now that the predicate and the object dictionary are
+                # available, add a ground atom to the current state for
+                # every grounding whose truth value is True.
+                for obj_args, truth_value in objs_and_val_dict.items():
+                    if truth_value:
+                        curr_ground_atoms_state.add(
+                            GroundAtom(
+                                pred_name_to_pred[pred_name],
+                                [curr_obj_name_to_obj[o] for o in obj_args]))
+            curr_atoms_traj.append(curr_ground_atoms_state)
+        # Add the goal atom to the final state of the trajectory.
+        curr_atoms_traj[-1].add(goal_atom)
+        atoms_trajs.append(curr_atoms_traj)
+    return atoms_trajs
+
+
+def _parse_structured_actions_into_ground_options(
+        structured_actions_trajs: List[List[Tuple[str, Tuple[str, ...],
+                                                  List[float]]]],
+        known_options: Set[ParameterizedOption],
+        train_tasks: List[Task]) -> List[List[_Option]]:
+    """Ground every (option name, object names, params) tuple into an _Option.
+
+    The i-th trajectory is grounded using the objects found in the initial
+    state of the i-th train task; initiable() is then called on that same
+    initial state for every ground option.
+    """
+    assert len(structured_actions_trajs) == len(train_tasks)
+    name_to_param_option = {o.name: o for o in known_options}
+    all_option_trajs = []
+    for task, actions_traj in zip(train_tasks, structured_actions_trajs):
+        name_to_obj = {obj.name: obj for obj in set(task.init)}
+        grounded_traj = []
+        for option_name, obj_names, params in actions_traj:
+            param_option = name_to_param_option[option_name]
+            option_objs = [name_to_obj[obj_name] for obj_name in obj_names]
+            ground_option = param_option.ground(option_objs,
+                                                np.array(params))
+            # Call initiable here because we will be calling
+            # terminal later, and initiable always needs
+            # to be called first.
+            ground_option.initiable(task.init)
+            grounded_traj.append(ground_option)
+        all_option_trajs.append(grounded_traj)
+    return all_option_trajs
+
+
+def _create_dummy_goal_state_for_each_task(
+        env: BaseEnv, train_tasks: List[Task]) -> List[State]:
+    """For each train task, return a copy of its initial state in which the
+    dummy goal object's "goal_true" feature is set to 1.0, so that the
+    DummyGoal atom holds."""
+    # FOR NOW, we assume there is only one goal predicate, and that it is
+    # a dummy goal predicate. In the future, we will implement and use
+    # proper goal predicates.
+    assert len(env.goal_predicates) == 1
+    goal_predicate = list(env.goal_predicates)[0]
+    assert goal_predicate.name == "DummyGoal"
+    goal_states = []
+    for train_task in train_tasks:
+        init_state = train_task.init
+        name_to_obj = {obj.name: obj for obj in init_state}
+        assert DUMMY_GOAL_OBJ_NAME in name_to_obj
+        dummy_goal_obj = name_to_obj[DUMMY_GOAL_OBJ_NAME]
+        feat_names = dummy_goal_obj.type.feature_names
+        assert len(feat_names) == 1
+        assert feat_names[0] == "goal_true"
+        goal_atom = GroundAtom(goal_predicate, [dummy_goal_obj])
+        # The goal must not hold initially, but must hold after the edit.
+        assert not goal_atom.holds(init_state)
+        goal_state = init_state.copy()
+        goal_state.set(dummy_goal_obj, "goal_true", 1.0)
+        assert goal_atom.holds(goal_state)
+        goal_states.append(goal_state)
+    return goal_states
+
+
+def _convert_ground_option_trajs_into_lowleveltrajs(
+        option_trajs: List[List[_Option]], dummy_goal_states: List[State],
+        train_tasks: List[Task]) -> List[LowLevelTrajectory]:
+    """Wrap each option trajectory in a LowLevelTrajectory for a Dataset.
+
+    The i-th trajectory pairs option_trajs[i] with train_tasks[i] and
+    dummy_goal_states[i]. Every action is a zero-dimensional Action that
+    carries one ground option.
+    """
+    assert len(option_trajs) == len(dummy_goal_states) == len(train_tasks)
+    low_level_trajs = []
+    for task_idx, option_traj in enumerate(option_trajs):
+        # NOTE: In this LowLevelTrajectory, we assume the low level states
+        # are the same as the init state until the final state.
+        init_state = train_tasks[task_idx].init
+        actions = [
+            Action(np.zeros(0, dtype=float), option) for option in option_traj
+        ]
+        # There is one more state than there are actions: the last one is
+        # the dummy goal state.
+        states = [init_state for _ in option_traj]
+        states.append(dummy_goal_states[task_idx])
+        low_level_trajs.append(
+            LowLevelTrajectory(states, actions, True, task_idx))
+    return low_level_trajs
+
+
+def _debug_log_atoms_trajs(
+        ground_atoms_trajs: List[List[Set[GroundAtom]]]) -> None:
+    """Log, at debug level, the atoms of the first state of each trajectory
+    and then the atoms added and deleted at every later step."""
+    # Effects are the set differences between consecutive states.
+    for atoms_traj in ground_atoms_trajs:
+        logging.debug(f"Step 0 atoms: {sorted(atoms_traj[0])}")
+        for i, (prev, curr) in enumerate(zip(atoms_traj, atoms_traj[1:]), 1):
+            logging.debug(f"Step {i} add effs: {sorted(curr - prev)}")
+            logging.debug(f"Step {i} del effs: {sorted(prev - curr)}")
+        logging.debug("\n")
+
+
+def _parse_options_txt_into_structured_actions(
+        text: str) -> List[Tuple[str, Tuple[str, ...], List[float]]]:
+    """Extract every 'Name(obj1, obj2)[p1, p2] ->' option found in text.
+
+    Returns one (option name, object names, continuous params) tuple per
+    match, in order of appearance. Params may be negative or use exponent
+    notation, both of which str() of a list of floats can produce.
+    """
+    # Groups: option name, comma-separated objects, comma-separated params.
+    pattern_option = r'(\w+)\(([^)]*)\)\[([-+\d.eE,\s]*)\] ->'
+    structured_actions_output = []
+    for name, objs_str, params_str in re.findall(pattern_option, text):
+        # Skip empty tokens so that 'Name()' yields no objects, and so that
+        # an empty '[]' or a stray trailing comma yields no bogus param.
+        obj_names = tuple(o.strip() for o in objs_str.split(',')
+                          if o.strip())
+        continuous_params_floats = [
+            float(p) for p in params_str.split(',') if p.strip()
+        ]
+        structured_actions_output.append(
+            (name, obj_names, continuous_params_floats))
+    return structured_actions_output
+
+
+def _parse_atoms_txt_into_structured_state(
+        text: str) -> List[Dict[str, Dict[Tuple[str, ...], bool]]]:
+    """Given text that contains a series of ground atoms labelled with their
+    specific truth values, convert this into a structured dictionary suitable
+    for later conversion into more structured GroundAtomTrajectories."""
+    # A state is a {...} block with at least one character that is not a
+    # digit, comma or whitespace (so blocks holding only numbers are skipped).
+    pattern_block_of_state = r"\{(.*?[^\d,\s].*?)\}"
+    # An atom looks like 'Name(obj1, obj2): Value'. No character is required
+    # after Value, so an atom without a trailing '.' keeps its full value.
+    pattern_predicate = r'(\w+)\(([^)]+)\): (\w+)'
+    structured_state_output = []
+    for state_block in re.findall(pattern_block_of_state, text, re.DOTALL):
+        current_predicate_data: Dict[str, Dict[Tuple[str, ...], bool]] = {}
+        for pred_name, objs_str, value_str in re.findall(
+                pattern_predicate, state_block):
+            objects = tuple(map(str.strip, objs_str.split(',')))
+            # Anything other than the exact text 'True' counts as False.
+            current_predicate_data.setdefault(
+                pred_name, {})[objects] = value_str == 'True'
+        structured_state_output.append(current_predicate_data)
+    return structured_state_output
+
+
+def _parse_vlmtraj_into_structured_traj(
+    text: str
+) -> Tuple[List[Dict[str, Dict[Tuple[str, ...], bool]]], List[Tuple[
+        str, Tuple[str, ...], List[float]]]]:
+    """Parse the text of one handwritten trajectory into a structured
+    representation suitable for later conversion into
+    GroundAtomTrajectories.
+
+    Two lists are returned. The first has one dictionary per state; it
+    maps each predicate name to a dict from a tuple of object names to
+    the boolean value of that ground predicate at that timestep. The
+    second has one tuple per option: the option name, the names of the
+    objects it uses, and its continuous parameters. An AssertionError is
+    raised unless there is exactly one more state than there are options.
+    """
+    states = _parse_atoms_txt_into_structured_state(text)
+    actions = _parse_options_txt_into_structured_actions(text)
+    num_states, num_actions = len(states), len(actions)
+    assert num_states == num_actions + 1, \
+        "Manual data malformed; num states != 1 + num options."
+    return (states, actions)
+
+
+def _parse_vlmtraj_file_into_structured_trajs(
+    filename: str
+) -> Tuple[List[List[Dict[str, Dict[Tuple[str, ...], bool]]]], List[List[Tuple[
+        str, Tuple[str, ...], List[float]]]]]:
+    """Parse a txt file full of handwritten trajectories into a structured
+    representation that can be used to convert these into
+    GroundAtomTrajectories suitable for predicate invention, operator learning,
+    etc.
+
+    We assume the vlmtraj is saved in a txt file with an encoding scheme
+    described in:
+    `approaches/documentation/grammar_search_invention_approach.md`.
+    This function outputs two lists of lists; the i-th elements are the two
+    outputs of _parse_vlmtraj_into_structured_traj for the i-th trajectory.
+    """
+    with open(filename, "r", encoding="utf8") as f:
+        full_file_text = f.read()
+    # Each trajectory is the text enclosed by a pair of "===" delimiters.
+    traj_texts = re.findall(r"(?<====\n)(.*?)(?=\n===)", full_file_text,
+                            re.DOTALL)
+    output_state_trajs, output_action_trajs = [], []
+    for traj_text in traj_texts:
+        states, actions = _parse_vlmtraj_into_structured_traj(traj_text)
+        output_state_trajs.append(states)
+        output_action_trajs.append(actions)
+    return (output_state_trajs, output_action_trajs)
+
+
+def create_ground_atom_data_from_labelled_txt(
+        env: BaseEnv, train_tasks: List[Task],
+        known_options: Set[ParameterizedOption]) -> Dataset:
+    """Build a Dataset from the txt file of VLM-labelled trajectories found at
+    CFG.data_dir/CFG.handmade_demo_filename, so that it can be passed to the
+    rest of our learning pipeline."""
+    dataset_fpath = os.path.join(CFG.data_dir, CFG.handmade_demo_filename)
+    # First, parse this dataset into a structured form.
+    structured_states, structured_actions = \
+        _parse_vlmtraj_file_into_structured_trajs(dataset_fpath)
+    assert len(structured_states) == len(structured_actions)
+    # Next, take this intermediate structured form and further
+    # parse it into ground atoms and ground options respectively.
+    ground_atoms_trajs = _parse_structured_state_into_ground_atoms(
+        env, train_tasks, structured_states)
+    _debug_log_atoms_trajs(ground_atoms_trajs)
+    option_trajs = _parse_structured_actions_into_ground_options(
+        structured_actions, known_options, train_tasks)
+    # We also need to create the goal state for every train task.
+    goal_states_for_every_traj = _create_dummy_goal_state_for_each_task(
+        env, train_tasks)
+    # Finally, we need to construct actual LowLevelTrajectories.
+    low_level_trajs = _convert_ground_option_trajs_into_lowleveltrajs(
+        option_trajs, goal_states_for_every_traj, train_tasks)
+    return Dataset(low_level_trajs, ground_atoms_trajs)
+
+
+def create_ground_atom_data_from_img_trajs(
+        env: BaseEnv,
+        train_tasks: List[Task],
+        known_options: Set[ParameterizedOption],
+        vlm: Optional[VisionLanguageModel] = None) -> Dataset:
+    """Build a Dataset from a folder of demonstrations, each of which has
+    images of the scene at every state plus a txt file naming the options
+    taken between states. The VLM labels atom values in every scene."""
+    trajectories_folder_path = os.path.join(
+        utils.get_path_to_predicators_root(), CFG.data_dir,
+        CFG.vlm_trajs_folder_name)
+    # First, run some checks on the folder name to make sure
+    # we're not accidentally loading the wrong one.
+    folder_name_components = CFG.vlm_trajs_folder_name.split('__')
+    assert folder_name_components[0] == CFG.env
+    assert folder_name_components[1] == "vlm_demos"
+    assert int(folder_name_components[2]) == CFG.seed
+    assert int(folder_name_components[3]) == CFG.num_train_tasks
+    num_trajs = len(os.listdir(trajectories_folder_path))
+    assert num_trajs == CFG.num_train_tasks
+    option_name_to_option = {opt.name: opt for opt in known_options}
+    image_option_trajs = []
+    all_task_objs = set()
+    for train_task_idx, path in enumerate(
+            sorted(Path(trajectories_folder_path).iterdir())):
+        assert path.is_dir()
+        state_folders = [f.path for f in os.scandir(path) if f.is_dir()]
+        num_states_in_traj = len(state_folders)
+        state_traj = []
+        for state_num in range(num_states_in_traj):
+            curr_imgs = []
+            curr_state_path = path.joinpath(str(state_num))
+            # NOTE: we assume all images are saved as jpg files.
+            img_files = sorted(glob.glob(str(curr_state_path) + "/*.jpg"))
+            for img in img_files:
+                curr_imgs.append(PIL.Image.open(img))
+            state_traj.append(curr_imgs)
+        # Get objects from train tasks to be used for future parsing.
+        curr_train_task = train_tasks[train_task_idx]
+        curr_task_objs = set(curr_train_task.init)
+        all_task_objs |= curr_task_objs
+        curr_task_obj_name_to_obj = {obj.name: obj for obj in curr_task_objs}
+        # Parse out actions for the trajectory.
+        options_traj_file_list = glob.glob(str(path) + "/*.txt")
+        assert len(options_traj_file_list) == 1
+        options_traj_file = options_traj_file_list[0]
+        with open(options_traj_file, "r", encoding="utf-8") as f:
+            options_file_str = f.read()
+        option_names_list = re.findall(r'(\w+)\(', options_file_str)
+        parsed_str_objects = re.findall(r'\((.*?)\)', options_file_str)
+        object_args_list = [obj.split(', ') for obj in parsed_str_objects]
+        # Drop each '[]' token. NOTE(review): remove() raises if it is absent.
+        for object_arg_sublist in object_args_list:
+            object_arg_sublist.remove('[]')
+        parameters = [
+            ast.literal_eval(obj) if obj else []
+            for obj in re.findall(r'\[(.*?)\]', options_file_str)
+        ]
+        ground_option_traj: List[_Option] = []
+        # Now actually create ground options.
+        for option_name, option_objs_strs_list, option_params in zip(
+                option_names_list, object_args_list, parameters):
+            objects = [
+                curr_task_obj_name_to_obj[opt_arg]
+                for opt_arg in option_objs_strs_list
+            ]
+            option = option_name_to_option[option_name]
+            ground_option = option.ground(objects, np.array(option_params))
+            # NOTE: we assert the option was initiable in the env's initial
+            # state because during learning, we will assert that the option's
+            # initiable function was previously called.
+            assert ground_option.initiable(curr_train_task.init)
+            ground_option_traj.append(ground_option)
+        # Given ground options, we can finally make ImageOptionTrajectories.
+        image_option_trajs.append(
+            ImageOptionTrajectory(list(curr_task_objs), state_traj,
+                                  ground_option_traj, True, train_task_idx))
+    # Given trajectories, we can now query the VLM to get proposals for ground
+    # atoms that might be relevant to decision-making.
+    if vlm is None:
+        vlm = GoogleGeminiVLM(CFG.vlm_model_name)  # pragma: no cover.
+
+    if not CFG.grammar_search_vlm_atom_proposal_use_debug:
+        logging.info("Querying VLM for candidate atom proposals...")
+        atom_strs_proposals_list = _sample_vlm_atom_proposals_from_trajectories(
+            image_option_trajs, vlm, 1)
+        logging.info("Done querying VLM for candidate atoms!")
+        # We now parse and sanitize this set of atoms.
+        atom_proposals_set = _parse_unique_atom_proposals_from_list(
+            atom_strs_proposals_list, all_task_objs)
+    else:  # pragma: no cover.
+        assert isinstance(env, VLMPredicateEnv)
+        atom_proposals_set = env.vlm_debug_atom_strs
+    assert len(atom_proposals_set) > 0, "Atom proposals set is empty!"
+    # Given this set of unique atom proposals, we now ask the VLM
+    # to label these in every scene from the demonstrations.
+    # NOTE: we convert to a sorted list here to get rid of randomness from set
+    # ordering.
+    unique_atoms_list = sorted(atom_proposals_set)
+    # Now, query the VLM!
+    logging.info("Querying VLM to label every scene...")
+    atom_labels = _label_trajectories_with_vlm_atom_values(
+        image_option_trajs, vlm, unique_atoms_list)
+    logging.info("Done querying VLM for scene labelling!")
+    # Save the output as a human-readable txt file.
+    save_labelled_trajs_as_txt(
+        env, atom_labels, [io_traj.actions for io_traj in image_option_trajs])
+    # Now, parse this information into a Dataset!
+    # Start by converting all the labelled atoms into a more structured
+    # dict. This requires each set of labelled atoms text to be enclosed
+    # by curly brackets.
+    structured_state_trajs = []
+    for atom_traj in atom_labels:
+        atoms_txt_strs = [
+            '{' + curr_ts_atoms_txt + '}' for curr_ts_atoms_txt in atom_traj
+        ]
+        full_traj_atoms_str = '\n\n'.join(atoms_txt_strs)
+        structured_state_trajs.append(
+            _parse_atoms_txt_into_structured_state(full_traj_atoms_str))
+    # Given this, we now convert each trajectory consisting of a series of
+    # structured states into a trajectory of GroundAtoms.
+    ground_atoms_trajs = _parse_structured_state_into_ground_atoms(
+        env, train_tasks, structured_state_trajs)
+    _debug_log_atoms_trajs(ground_atoms_trajs)
+    # Now, we just need to create a goal state for every train task where
+    # the dummy goal predicate holds. This is just bookkeeping necessary
+    # for NSRT learning and planning such that the goal doesn't hold
+    # in the initial state and holds in the final state of each demonstration
+    # trajectory.
+    goal_states_for_every_traj = _create_dummy_goal_state_for_each_task(
+        env, train_tasks)
+    # Finally, we need to construct actual LowLevelTrajectories.
+    low_level_trajs = _convert_ground_option_trajs_into_lowleveltrajs(
+        [traj.actions for traj in image_option_trajs],
+        goal_states_for_every_traj, train_tasks)
+    return Dataset(low_level_trajs, ground_atoms_trajs)
diff --git a/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_cot.txt b/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_cot.txt
new file mode 100644
index 0000000000..74e1e05ac1
--- /dev/null
+++ b/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_cot.txt
@@ -0,0 +1,2 @@
+You are a vision system for a robot. Your job is to output the values of the following predicates based on the provided visual scene. For each predicate, output True, False, or Unknown if the relevant objects are not in the scene or the value of the predicate simply cannot be determined. Output each predicate value as a bulleted list with each predicate and value on a different line. For each output value, provide an explanation as to why you labelled this predicate as having this particular value. Use the format: <predicate>: <truth_value>. <explanation>.
+Predicates:
\ No newline at end of file
diff --git a/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_naive.txt b/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_naive.txt
new file mode 100644
index 0000000000..45ecfebcd6
--- /dev/null
+++ b/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_naive.txt
@@ -0,0 +1,2 @@
+You are a vision system for a robot. Your job is to output the values of the following predicates based on the provided visual scene. For each predicate, output True, False, or Unknown if the relevant objects are not in the scene or the value of the predicate simply cannot be determined. Output each predicate value as a bulleted list with each predicate and value on a different line. Use the format: <predicate>: <truth_value>. Ensure there is a period ('.') after every list item. Do not output any text except the names and truth values of predicates.
+Predicates:
\ No newline at end of file
diff --git a/predicators/datasets/vlm_input_data_prompts/atom_proposal/naive_each_step.txt b/predicators/datasets/vlm_input_data_prompts/atom_proposal/naive_each_step.txt
new file mode 100644
index 0000000000..881930e574
--- /dev/null
+++ b/predicators/datasets/vlm_input_data_prompts/atom_proposal/naive_each_step.txt
@@ -0,0 +1 @@
+You are a robotic vision system whose job is to output a structured set of predicates useful for running a task and motion planning system from the following scene. Please provide predicates in terms of the following objects: {objs}. For each predicate, output it in the following format: predicate_name(obj1, obj2, obj3...) (for instance is_sliced(apple), is_not_sliced(apple), etc.). Also, for each predicate you list, list its negation. List as many predicates as you can possibly think of, even if they're only tangentially relevant to what you see in the scene and even if they're false, given the following scene taken from a demonstration for the task. Do not list any other text other than the names and arguments of predicates. List each proposal as a bulleted list item on a separate line.
\ No newline at end of file
diff --git a/predicators/datasets/vlm_input_data_prompts/atom_proposal/naive_whole_traj.txt b/predicators/datasets/vlm_input_data_prompts/atom_proposal/naive_whole_traj.txt
new file mode 100644
index 0000000000..41db3fe343
--- /dev/null
+++ b/predicators/datasets/vlm_input_data_prompts/atom_proposal/naive_whole_traj.txt
@@ -0,0 +1 @@
+You are a robotic vision system whose job is to output a structured set of predicates useful for describing the important concepts from the following demonstration. Please provide predicates in terms of the objects: {objs}. For each predicate, output it in the following format: predicate_name(obj1, obj2, obj3...) (for instance is_sliced(apple), is_not_sliced(apple), etc.). Also, for each predicate you list, list its negation. Generate as many predicates as you can possibly think of, even if they're only tangentially relevant to the task goal: 'make a cup of ice tea'. Do not list any other text other than the names and arguments of predicates. List each proposal as a bulleted list item on a separate line.
\ No newline at end of file
diff --git a/predicators/datasets/vlm_input_data_prompts/atom_proposal/options_labels_whole_traj.txt b/predicators/datasets/vlm_input_data_prompts/atom_proposal/options_labels_whole_traj.txt
new file mode 100644
index 0000000000..9532669d5f
--- /dev/null
+++ b/predicators/datasets/vlm_input_data_prompts/atom_proposal/options_labels_whole_traj.txt
@@ -0,0 +1 @@
+You are a robotic vision system whose job is to output a structured set of predicates useful for describing important concepts in the following demonstration of a task. You will be provided with a list of actions used during the task, as well as images of states before and after every action execution. Please provide predicates in terms of the following objects: {objs}. For each predicate, output it in the following format: predicate_name(obj1, obj2, obj3...). Start by generating predicates that change before and after each action. After this, generate any other predicates that perhaps do not change but are still important to describing the demonstration shown.
\ No newline at end of file
diff --git a/predicators/envs/base_env.py b/predicators/envs/base_env.py
index 0dbe69aa06..931c718245 100644
--- a/predicators/envs/base_env.py
+++ b/predicators/envs/base_env.py
@@ -11,7 +11,7 @@
 from gym.spaces import Box
 
 from predicators import utils
-from predicators.llm_interface import OpenAILLM
+from predicators.pretrained_model_interface import OpenAILLM
 from predicators.settings import CFG
 from predicators.structs import Action, DefaultEnvironmentTask, \
     EnvironmentTask, GroundAtom, Object, Observation, Predicate, State, Type, \
@@ -328,6 +328,7 @@ def _parse_language_goal_from_json(
         prompt = prompt_prefix + f"\n# {language_goal}"
         llm = OpenAILLM(CFG.llm_model_name)
         responses = llm.sample_completions(prompt,
+                                           None,
                                            temperature=0.0,
                                            seed=CFG.seed,
                                            stop_token="#")
diff --git a/predicators/envs/pybullet_env.py b/predicators/envs/pybullet_env.py
index 6eb3c2c160..02c6822d33 100644
--- a/predicators/envs/pybullet_env.py
+++ b/predicators/envs/pybullet_env.py
@@ -209,11 +209,6 @@ def render(self,
         # and cannot be used in headless mode.
         del caption  # unused
 
-        if not self.using_gui:
-            raise Exception(
-                "Rendering only works with GUI on. See "
-                "https://github.com/bulletphysics/bullet3/issues/1157")
-
         view_matrix = p.computeViewMatrixFromYawPitchRoll(
             cameraTargetPosition=self._camera_target,
             distance=self._camera_distance,
diff --git a/predicators/envs/stick_button.py b/predicators/envs/stick_button.py
index 84517f36c0..1d9ded8712 100644
--- a/predicators/envs/stick_button.py
+++ b/predicators/envs/stick_button.py
@@ -147,9 +147,9 @@ def simulate(self, state: State, action: Action) -> State:
                 # Check for a collision with the stick holder. The reason that
                 # we only check for a collision here, as opposed to every
                 # timestep, is that we imagine the robot moving down in the z
-                # direction to pick up the stick, at which button it may
-                # collide with the stick holder. On other timesteps, the robot
-                # would be high enough above the holder to avoid collisions.
+                # direction to pick up the stick after it has reached it on the
+                # x-y plane. On other timesteps, the robot would be high enough
+                # above the holder to avoid collisions.
                 if robot_circ.intersects(holder_rect):
                     # No-op in case of collision.
                     return state.copy()
@@ -387,7 +387,6 @@ def object_to_geom(cls, obj: Object, state: State) -> _Geom2D:
                                    width=cls._get_holder_width(),
                                    height=cls.holder_height,
                                    theta=theta)
-        assert obj.is_instance(cls._stick_type)
         theta = state.get(obj, "theta")
         return utils.Rectangle(x=x,
                                y=y,
@@ -416,6 +415,7 @@ def _Pressed_holds(state: State, objects: Sequence[Object]) -> bool:
     @classmethod
     def Above_holds(cls, state: State, objects: Sequence[Object]) -> bool:
         """Public for use by oracle options."""
+        assert len(objects) == 2
         obj1, obj2 = objects
         geom1 = cls.object_to_geom(obj1, state)
         geom2 = cls.object_to_geom(obj2, state)
@@ -473,3 +473,260 @@ def _event_to_action(state: State,
             return Action(np.array([dx, dy, 0.0, -1.0], dtype=np.float32))
 
         return _event_to_action
+
+
+class StickButtonMovementEnv(StickButtonEnv):
+    """An extension to the stick button env that also has movement options
+    (the pick and place options don't implicitly contain movement)."""
+
+    # Make x_ub smaller to make predicate invention constant finding easier.
+    x_ub: ClassVar[float] = 6.0
+    rz_x_ub: ClassVar[float] = x_ub
+    # The (x, y) is the bottom left-hand corner of the stick, and theta
+    # is CCW angle in radians, consistent with utils.Rectangle. The tip
+    # x and y correspond to the end of the stick.
+    _stick_type = Type("stick", ["x", "y", "tip_x", "tip_y", "theta", "held"])
+    # We add an attribute for the open/closed status of the robot's gripper.
+    _robot_type = Type("robot", ["x", "y", "theta", "fingers"])
+
+    def __init__(self, use_gui: bool = True) -> None:
+        super().__init__(use_gui)
+
+        self._HandEmpty = Predicate("HandEmpty", [self._robot_type],
+                                    self._HandEmpty_holds_diff_signature)
+
+    def _get_tasks(self, num: int, num_button_lst: List[int],
+                   rng: np.random.Generator) -> List[EnvironmentTask]:
+        tasks = []
+        for _ in range(num):
+            state_dict = {}
+            num_buttons = num_button_lst[rng.choice(len(num_button_lst))]
+            buttons = [
+                Object(f"button{i}", self._button_type)
+                for i in range(num_buttons)
+            ]
+            goal = {GroundAtom(self._Pressed, [p]) for p in buttons}
+            # Sample initial positions for buttons, making sure to keep them
+            # far enough apart from one another.
+            collision_geoms: Set[utils.Circle] = set()
+            radius = self.button_radius + self.init_padding
+            for button in buttons:
+                # Assuming that the dimensions are forgiving enough that
+                # infinite loops are impossible.
+                while True:
+                    x = rng.uniform(self.x_lb + radius, self.x_ub - radius)
+                    y = rng.uniform(self.y_lb + radius, self.y_ub - radius)
+                    geom = utils.Circle(x, y, radius)
+                    # Keep only if no intersections with existing objects.
+                    # Also enforce that the button is clearly on one side
+                    # of the boundary between robot's reachable vs
+                    # unreachable regions to make predicate invention
+                    # easier.
+                    if not any(geom.intersects(g)
+                               for g in collision_geoms) and abs(
+                                   y - self.rz_y_ub) > radius:
+                        break
+                collision_geoms.add(geom)
+                state_dict[button] = {"x": x, "y": y, "pressed": 0.0}
+            # Sample an initial position for the robot, making sure that it
+            # doesn't collide with buttons and that it's in the reachable zone.
+            radius = self.robot_radius + self.init_padding
+            while True:
+                x = rng.uniform(self.rz_x_lb + radius, self.rz_x_ub - radius)
+                y = rng.uniform(self.rz_y_lb + radius, self.rz_y_ub - radius)
+                geom = utils.Circle(x, y, radius)
+                # Keep only if no intersections with existing objects.
+                if not any(geom.intersects(g) for g in collision_geoms):
+                    break
+            collision_geoms.add(geom)
+            if CFG.stick_button_disable_angles:
+                theta = np.pi / 2
+            else:
+                theta = rng.uniform(self.theta_lb, self.theta_ub)
+            # Initialize the robot with open fingers.
+            state_dict[self._robot] = {
+                "x": x,
+                "y": y,
+                "theta": theta,
+                "fingers": 1.0
+            }
+            # Sample the stick, making sure that the origin is in the
+            # reachable zone, and that the stick doesn't collide with anything.
+            radius = self.robot_radius + self.init_padding
+            while True:
+                # The radius here is to prevent the stick from being very
+                # slightly in the reachable zone, but not grabbable.
+                x = rng.uniform(self.rz_x_lb + radius, self.rz_x_ub - radius)
+                y = rng.uniform(self.stick_init_lb, self.stick_init_ub)
+                assert self.rz_y_lb + radius <= y <= self.rz_y_ub - radius
+                if CFG.stick_button_disable_angles:
+                    theta = np.pi / 2
+                else:
+                    theta = rng.uniform(self.theta_lb, self.theta_ub)
+                rect = utils.Rectangle(x, y, self.stick_width,
+                                       self.stick_height, theta)
+                # Keep only if no intersections with existing objects.
+                if not any(rect.intersects(g) for g in collision_geoms):
+                    break
+            tip_rect = self.stick_rect_to_tip_rect(rect)
+            state_dict[self._stick] = {
+                "x": x,
+                "y": y,
+                "tip_x": tip_rect.x,
+                "tip_y": tip_rect.y,
+                "theta": theta,
+                "held": 0.0
+            }
+            # Create the holder for the stick, sampling the position so that it
+            # is somewhere along the long dimension of the stick. To make sure
+            # that the problem is solvable, check that if the stick were
+            # grasped at the lowest reachable position, it would still be
+            # able to press the highest button.
+            max_button_y = max(state_dict[p]["y"] for p in buttons)
+            necessary_reach = max_button_y - self.rz_y_ub
+            while True:
+                # Allow the stick to start in the middle of the holder.
+                x_offset = rng.uniform(-self._get_holder_width(),
+                                       self.stick_width / 2)
+                # Check solvability.
+                # Case 0: If all buttons are within reach, we're all set.
+                if necessary_reach < 0:
+                    break
+                # Case 1: we can grasp the stick from the bottom.
+                if x_offset > 2 * self.robot_radius:
+                    break
+                # Case 2: we can grasp the stick above the holder, but we can
+                # still reach the highest button.
+                min_rel_grasp = x_offset + self._get_holder_width()
+                grasp_to_top = self.stick_width - min_rel_grasp
+                if grasp_to_top > necessary_reach:
+                    break
+            # First orient the rectangle at 0 and then rotate it.
+            # Along the shorter dimension, we want the stick to be in the
+            # center of the holder, so we need to translate the holder's y
+            # position relative to the stick's y position.
+            assert self.holder_height > self.stick_height
+            height_diff = self.holder_height - self.stick_height
+            holder_rect = utils.Rectangle(
+                x=x + x_offset,
+                y=(y - height_diff / 2),
+                width=self._get_holder_width(),
+                height=self.holder_height,
+                theta=0,
+            )
+            holder_rect = holder_rect.rotate_about_point(x, y, theta)
+            state_dict[self._holder] = {
+                "x": holder_rect.x,
+                "y": holder_rect.y,
+                "theta": holder_rect.theta,
+            }
+            init_state = utils.create_state_from_dict(state_dict)
+            task = EnvironmentTask(init_state, goal)
+            tasks.append(task)
+        return tasks
+
+    @staticmethod
+    def _Grasped_holds(state: State, objects: Sequence[Object]) -> bool:
+        robot, stick = objects
+        stick_held = state.get(stick, "held") > 0.5
+        fingers_closed = state.get(robot, "fingers") <= 0.5
+        return stick_held and fingers_closed
+
+    @staticmethod
+    def _HandEmpty_holds_diff_signature(state: State,
+                                        objects: Sequence[Object]) -> bool:
+        robot, = objects
+        return state.get(robot, "fingers") > 0.5
+
+    @property
+    def action_space(self) -> Box:
+        # Normalized dx, dy, dtheta, press, pickplace.
+        return Box(low=-1., high=1., shape=(5, ), dtype=np.float32)
+
+    def simulate(self, state: State, action: Action) -> State:
+        assert self.action_space.contains(action.arr)
+        norm_dx, norm_dy, norm_dtheta, press, pickplace = action.arr
+        # Actions are normalized to [-1, 1]. Denormalize them here.
+        dx = norm_dx * self.max_speed
+        dy = norm_dy * self.max_speed
+        if CFG.stick_button_disable_angles:
+            dtheta = 0.0
+        else:
+            dtheta = norm_dtheta * self.max_angular_speed
+        # Update the robot state.
+        rx = state.get(self._robot, "x")
+        ry = state.get(self._robot, "y")
+        rtheta = state.get(self._robot, "theta")
+        new_rx = rx + dx
+        new_ry = ry + dy
+        new_rtheta = rtheta + dtheta
+        # The robot cannot leave the reachable zone. If it tries to, noop.
+        rad = self.robot_radius
+        if not self.rz_x_lb + rad <= new_rx <= self.rz_x_ub - rad or \
+           not self.rz_y_lb + rad <= new_ry <= self.rz_y_ub - rad:
+            return state.copy()
+        next_state = state.copy()
+        next_state.set(self._robot, "x", new_rx)
+        next_state.set(self._robot, "y", new_ry)
+        next_state.set(self._robot, "theta", new_rtheta)
+        robot_circ = self.object_to_geom(self._robot, next_state)
+
+        # Check if the stick is held. If so, we need to move and rotate it.
+        stick_held = state.get(self._stick, "held") > 0.5
+        fingers_closed = state.get(self._robot, "fingers") <= 0.5
+        stick_rect = self.object_to_geom(self._stick, state)
+        assert isinstance(stick_rect, utils.Rectangle)
+        if stick_held and fingers_closed:
+            if not CFG.stick_button_disable_angles:
+                stick_rect = stick_rect.rotate_about_point(rx, ry, dtheta)
+            stick_rect = utils.Rectangle(x=(stick_rect.x + dx),
+                                         y=(stick_rect.y + dy),
+                                         width=stick_rect.width,
+                                         height=stick_rect.height,
+                                         theta=stick_rect.theta)
+            next_state.set(self._stick, "x", stick_rect.x)
+            next_state.set(self._stick, "y", stick_rect.y)
+            next_state.set(self._stick, "theta", stick_rect.theta)
+
+        if press > 0:
+            # Check if any button is now pressed.
+            tip_rect = self.stick_rect_to_tip_rect(stick_rect)
+            for button in state.get_objects(self._button_type):
+                circ = self.object_to_geom(button, state)
+                if (circ.intersects(tip_rect) and stick_held) or \
+                   (circ.intersects(robot_circ) and not stick_held):
+                    next_state.set(button, "pressed", 1.0)
+
+        if pickplace > 0:
+            # Check for placing the stick.
+            holder_rect = self.object_to_geom(self._holder, state)
+            if stick_held and fingers_closed and stick_rect.intersects(
+                    holder_rect):
+                # Place the stick back on the holder.
+                next_state.set(self._stick, "held", 0.0)
+                next_state.set(self._robot, "fingers", 1.0)
+
+            # Check if the stick is now held for the first time.
+            if not stick_held and stick_rect.intersects(robot_circ):
+                # Check for a collision with the stick holder. The reason that
+                # we only check for a collision here, as opposed to every
+                # timestep, is that we imagine the robot moving down in the z
+                # direction to pick up the stick after it has reached it on the
+                # x-y plane. On other timesteps, the robot would be high enough
+                # above the holder to avoid collisions.
+                if robot_circ.intersects(holder_rect):
+                    # No-op in case of collision.
+                    return state.copy()
+
+                next_state.set(self._stick, "held", 1.0)
+                next_state.set(self._robot, "fingers", 0.0)
+
+        tip_rect = self.stick_rect_to_tip_rect(stick_rect)
+        next_state.set(self._stick, "tip_x", tip_rect.x)
+        next_state.set(self._stick, "tip_y", tip_rect.y)
+
+        return next_state
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "stick_button_move"
diff --git a/predicators/envs/vlm_envs.py b/predicators/envs/vlm_envs.py
new file mode 100644
index 0000000000..9cba7d610e
--- /dev/null
+++ b/predicators/envs/vlm_envs.py
@@ -0,0 +1,153 @@
+"""A bunch of environments useful for testing VLM-based Predicate Invention.
+
+Will likely be updated and potentially split into separate files in the
+future.
+"""
+
+import abc
+from typing import List, Optional, Sequence, Set
+
+import matplotlib
+import numpy as np
+from gym.spaces import Box
+
+from predicators.envs import BaseEnv
+from predicators.settings import CFG
+from predicators.structs import Action, EnvironmentTask, GroundAtom, Object, \
+    Predicate, State, Type
+
+DUMMY_GOAL_OBJ_NAME = "dummy_goal_obj"  # used in VLM parsing as well.
+
+
+class VLMPredicateEnv(BaseEnv):
+    """Environments that use VLM Predicates.
+
+    Note that no simulate function or ground truth model is implemented
+    for these yet. These are forthcoming.
+    """
+
+    def __init__(self, use_gui: bool = True) -> None:
+        super().__init__(use_gui)
+
+        # Types
+        self._object_type = Type("object", [])
+        self._goal_object_type = Type("goal_object", ["goal_true"],
+                                      self._object_type)
+
+        # Predicates
+        self._DummyGoal = Predicate("DummyGoal", [self._goal_object_type],
+                                    self._Dummy_Goal_holds)
+
+    def simulate(self, state: State, action: Action) -> State:
+        raise ValueError("Simulate shouldn't be getting called!")
+
+    @property
+    def types(self) -> Set[Type]:
+        return {self._object_type, self._goal_object_type}
+
+    def _Dummy_Goal_holds(self, state: State,
+                          objects: Sequence[Object]) -> bool:
+        obj, = objects
+        return state.get(obj, "goal_true") > 0.5
+
+    def _generate_train_tasks(self) -> List[EnvironmentTask]:
+        return self._get_tasks(num=CFG.num_train_tasks, rng=self._train_rng)
+
+    def _generate_test_tasks(self) -> List[EnvironmentTask]:
+        return self._get_tasks(num=CFG.num_test_tasks, rng=self._test_rng)
+
+    @property
+    def predicates(self) -> Set[Predicate]:
+        return {self._DummyGoal}
+
+    @property
+    def goal_predicates(self) -> Set[Predicate]:
+        return {self._DummyGoal}
+
+    @property
+    def action_space(self) -> Box:
+        return Box(low=0.0, high=1.0, shape=(0, ), dtype=np.float32)
+
+    def render_state_plt(
+            self,
+            state: State,
+            task: EnvironmentTask,
+            action: Optional[Action] = None,
+            caption: Optional[str] = None) -> matplotlib.figure.Figure:
+        raise ValueError("shouldn't be trying to render env at any point!")
+
+    def _get_tasks(
+        self, num: int, rng: np.random.Generator
+    ) -> List[EnvironmentTask]:  # pragma: no cover.
+        del num, rng
+        raise NotImplementedError("Override!")
+
+    @property
+    @abc.abstractmethod
+    def vlm_debug_atom_strs(self) -> Set[str]:
+        """Return a set of atom strings that should be sufficient for a VLM to
+        label demonstrations consistently to learn good operators."""
+        raise NotImplementedError(
+            "VLM debug atom strings not implemented for this environment.")
+
+
+class IceTeaMakingEnv(VLMPredicateEnv):
+    """A (simplified) version of a tea-making task that's closer to pick-and-
+    place than real tea-making."""
+
+    def __init__(self, use_gui: bool = True) -> None:
+        super().__init__(use_gui)
+
+        # Env-specific types.
+        self._teabag_type = Type("teabag", [], self._object_type)
+        self._spoon_type = Type("spoon", [], self._object_type)
+        self._cup_type = Type("cup", [], self._object_type)
+        self._plate_type = Type("plate", [], self._object_type)
+        self._hand_type = Type("hand", [], self._object_type)
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "ice_tea_making"
+
+    @property
+    def types(self) -> Set[Type]:
+        return super().types | {
+            self._teabag_type, self._spoon_type, self._cup_type,
+            self._plate_type, self._hand_type
+        }
+
+    def _get_tasks(self, num: int,
+                   rng: np.random.Generator) -> List[EnvironmentTask]:
+        del rng  # unused.
+        dummy_goal_obj = Object(DUMMY_GOAL_OBJ_NAME, self._goal_object_type)
+        teabag_obj = Object("teabag", self._teabag_type)
+        spoon_obj = Object("spoon", self._spoon_type)
+        cup_obj = Object("cup", self._cup_type)
+        plate_obj = Object("plate", self._plate_type)
+        hand_obj = Object("hand", self._hand_type)
+        init_state = State({
+            dummy_goal_obj: np.array([0.0]),
+            teabag_obj: np.array([]),
+            plate_obj: np.array([]),
+            spoon_obj: np.array([]),
+            cup_obj: np.array([]),
+            hand_obj: np.array([])
+        })
+        return [
+            EnvironmentTask(
+                init_state,
+                set([GroundAtom(self._DummyGoal, [dummy_goal_obj])]))
+            for _ in range(num)
+        ]
+
+    @property
+    def vlm_debug_atom_strs(self) -> Set[str]:
+        """A 'debug grammar' set of predicates that should be sufficient for
+        completing the task; useful for comparing different methods of VLM
+        truth-value labelling given the same set of atom proposals to label."""
+        return set([
+            "hand_grasping_spoon(hand, spoon)",
+            "hand_grasping_teabag(hand, teabag)", "spoon_in_cup(spoon, cup)",
+            "spoon_on_plate(spoon, plate)", "teabag_in_cup(teabag, cup)",
+            "teabag_on_plate(teabag, plate)"
+        ])
diff --git a/predicators/ground_truth_models/ice_tea_making/__init__.py b/predicators/ground_truth_models/ice_tea_making/__init__.py
new file mode 100644
index 0000000000..f12182c3e2
--- /dev/null
+++ b/predicators/ground_truth_models/ice_tea_making/__init__.py
@@ -0,0 +1,8 @@
+"""Ground-truth models for the ice tea making environment."""
+
+from .nsrts import TeaMakingGroundTruthNSRTFactory
+from .options import TeaMakingGroundTruthOptionFactory
+
+__all__ = [
+    "TeaMakingGroundTruthNSRTFactory", "TeaMakingGroundTruthOptionFactory"
+]
diff --git a/predicators/ground_truth_models/ice_tea_making/nsrts.py b/predicators/ground_truth_models/ice_tea_making/nsrts.py
new file mode 100644
index 0000000000..ee0f6a60fa
--- /dev/null
+++ b/predicators/ground_truth_models/ice_tea_making/nsrts.py
@@ -0,0 +1,23 @@
+"""Ground-truth NSRTs for the ice tea making environment."""
+
+from typing import Dict, Set
+
+from predicators.ground_truth_models import GroundTruthNSRTFactory
+from predicators.structs import NSRT, ParameterizedOption, Predicate, Type
+
+
+class TeaMakingGroundTruthNSRTFactory(GroundTruthNSRTFactory):
+    """Ground-truth NSRTs for the ice tea making environment."""
+
+    @classmethod
+    def get_env_names(cls) -> Set[str]:
+        return {"ice_tea_making"}
+
+    @staticmethod
+    def get_nsrts(
+        env_name: str, types: Dict[str, Type], predicates: Dict[str,
+                                                                Predicate],
+        options: Dict[str,
+                      ParameterizedOption]) -> Set[NSRT]:  # pragma: no cover
+        # No ground-truth NSRTs are defined for this environment yet.
+        return set()
diff --git a/predicators/ground_truth_models/ice_tea_making/options.py b/predicators/ground_truth_models/ice_tea_making/options.py
new file mode 100644
index 0000000000..69c65b8dda
--- /dev/null
+++ b/predicators/ground_truth_models/ice_tea_making/options.py
@@ -0,0 +1,59 @@
+"""Ground-truth options for the ice tea making environment."""
+
+from typing import Dict, Sequence, Set
+
+from gym.spaces import Box
+
+from predicators import utils
+from predicators.ground_truth_models import GroundTruthOptionFactory
+from predicators.structs import Action, Array, Object, ParameterizedOption, \
+    ParameterizedPolicy, Predicate, State, Type
+
+
+class TeaMakingGroundTruthOptionFactory(GroundTruthOptionFactory):
+    """Ground-truth options for the ice tea making environment."""
+
+    @classmethod
+    def get_env_names(cls) -> Set[str]:
+        return {"ice_tea_making"}
+
+    @classmethod
+    def get_options(
+            cls, env_name: str, types: Dict[str,
+                                            Type], predicates: Dict[str,
+                                                                    Predicate],
+            action_space: Box) -> Set[ParameterizedOption]:  # pragma: no cover
+
+        del env_name, predicates  # unused.
+
+        object_type = types["object"]
+        cup_type = types["cup"]
+        hand_type = types["hand"]
+
+        Pick = utils.SingletonParameterizedOption(
+            # variables: [object to pick, hand to pick with]
+            # params: []
+            "pick",
+            cls._create_dummy_policy(action_space),
+            types=[object_type, hand_type])
+
+        PlaceInCup = utils.SingletonParameterizedOption(
+            # variables: [object to place, cup to place in]
+            # params: []
+            "place_in",
+            cls._create_dummy_policy(action_space),
+            types=[object_type, cup_type])
+
+        return {Pick, PlaceInCup}
+
+    @classmethod
+    def _create_dummy_policy(
+            cls, action_space: Box) -> ParameterizedPolicy:  # pragma: no cover
+        del action_space  # unused
+
+        def policy(state: State, memory: Dict, objects: Sequence[Object],
+                   params: Array) -> Action:
+            del state, memory, objects, params
+            raise ValueError("Shouldn't be attempting to run this policy!")
+
+        return policy
diff --git a/predicators/ground_truth_models/stick_button/nsrts.py b/predicators/ground_truth_models/stick_button/nsrts.py
index fc408d1850..8a76fa5ff6 100644
--- a/predicators/ground_truth_models/stick_button/nsrts.py
+++ b/predicators/ground_truth_models/stick_button/nsrts.py
@@ -45,8 +45,9 @@ def get_nsrts(env_name: str, types: Dict[str, Type],
         # RobotPressButtonFromNothing
         robot = Variable("?robot", robot_type)
         button = Variable("?button", button_type)
-        parameters = [robot, button]
-        option_vars = [robot, button]
+        stick = Variable("?stick", stick_type)
+        parameters = [robot, button, stick]
+        option_vars = [robot, button, stick]
         option = RobotPressButton
         preconditions = {
             LiftedAtom(HandEmpty, [robot]),
@@ -68,8 +69,9 @@ def get_nsrts(env_name: str, types: Dict[str, Type],
         robot = Variable("?robot", robot_type)
         button = Variable("?button", button_type)
         from_button = Variable("?from_button", button_type)
-        parameters = [robot, button, from_button]
-        option_vars = [robot, button]
+        stick = Variable("?stick", stick_type)
+        parameters = [robot, button, from_button, stick]
+        option_vars = [robot, button, stick]
         option = RobotPressButton
         preconditions = {
             LiftedAtom(HandEmpty, [robot]),
@@ -242,3 +244,279 @@ def place_stick_sampler(state: State, goal: Set[GroundAtom],
         nsrts.add(place_stick_nsrt)
 
         return nsrts
+
+
+class StickButtonMoveGroundTruthNSRTFactory(StickButtonGroundTruthNSRTFactory):
+    """Ground-truth NSRTs for the stick button environment with movement
+    options."""
+
+    @classmethod
+    def get_env_names(cls) -> Set[str]:
+        return {"stick_button_move"}
+
+    @staticmethod
+    def get_nsrts(env_name: str, types: Dict[str, Type],
+                  predicates: Dict[str, Predicate],
+                  options: Dict[str, ParameterizedOption]) -> Set[NSRT]:
+        # Types
+        robot_type = types["robot"]
+        button_type = types["button"]
+        stick_type = types["stick"]
+        holder_type = types["holder"]
+
+        # Predicates
+        Pressed = predicates["Pressed"]
+        RobotAboveButton = predicates["RobotAboveButton"]
+        StickAboveButton = predicates["StickAboveButton"]
+        Grasped = predicates["Grasped"]
+        HandEmpty = predicates["HandEmpty"]
+        AboveNoButton = predicates["AboveNoButton"]
+
+        # Options
+        RobotPressButton = options["RobotPressButton"]
+        PickStick = options["PickStick"]
+        StickPressButton = options["StickPressButton"]
+        PlaceStick = options["PlaceStick"]
+        RobotMoveToButton = options["RobotMoveToButton"]
+        StickMoveToButton = options["StickMoveToButton"]
+
+        nsrts = set()
+
+        # RobotMoveToButtonFromNothing
+        robot = Variable("?robot", robot_type)
+        button = Variable("?button", button_type)
+        parameters = [robot, button]
+        option_vars = [robot, button]
+        option = RobotMoveToButton
+        preconditions = {
+            LiftedAtom(AboveNoButton, []),
+            LiftedAtom(HandEmpty, [robot]),
+        }
+        add_effects = {
+            LiftedAtom(RobotAboveButton, [robot, button]),
+        }
+        delete_effects = {LiftedAtom(AboveNoButton, [])}
+        ignore_effects: Set[Predicate] = set()
+        robot_moveto_button_from_nothing_nsrt = NSRT(
+            "RobotMoveToButtonFromNothing", parameters, preconditions,
+            add_effects, delete_effects, ignore_effects, option, option_vars,
+            null_sampler)
+        nsrts.add(robot_moveto_button_from_nothing_nsrt)
+
+        # RobotMoveToButtonFromButton
+        robot = Variable("?robot", robot_type)
+        from_button = Variable("?from", button_type)
+        to_button = Variable("?to", button_type)
+        parameters = [robot, from_button, to_button]
+        option_vars = [robot, to_button]
+        option = RobotMoveToButton
+        preconditions = {
+            LiftedAtom(RobotAboveButton, [robot, from_button]),
+            LiftedAtom(HandEmpty, [robot]),
+        }
+        add_effects = {
+            LiftedAtom(RobotAboveButton, [robot, to_button]),
+        }
+        delete_effects = {LiftedAtom(RobotAboveButton, [robot, from_button])}
+        robot_moveto_button_from_button_nsrt = NSRT(
+            "RobotMoveToButtonFromButton", parameters, preconditions,
+            add_effects, delete_effects, ignore_effects, option, option_vars,
+            null_sampler)
+        nsrts.add(robot_moveto_button_from_button_nsrt)
+
+        # StickMoveToButtonFromButton
+        robot = Variable("?robot", robot_type)
+        stick = Variable("?stick", stick_type)
+        to_button = Variable("?to", button_type)
+        from_button = Variable("?from", button_type)
+        parameters = [robot, stick, to_button, from_button]
+        option_vars = [robot, to_button, stick]
+        option = StickMoveToButton
+        preconditions = {
+            LiftedAtom(Grasped, [robot, stick]),
+            LiftedAtom(StickAboveButton, [stick, from_button]),
+        }
+        add_effects = {
+            LiftedAtom(StickAboveButton, [stick, to_button]),
+        }
+        delete_effects = {LiftedAtom(StickAboveButton, [stick, from_button])}
+        ignore_effects = set()
+        stick_moveto_button_from_button_nsrt = NSRT(
+            "StickMoveToButtonFromButton", parameters, preconditions,
+            add_effects, delete_effects, ignore_effects, option, option_vars,
+            null_sampler)
+        nsrts.add(stick_moveto_button_from_button_nsrt)
+
+        # StickMoveToButtonFromNothing
+        robot = Variable("?robot", robot_type)
+        stick = Variable("?stick", stick_type)
+        to_button = Variable("?to", button_type)
+        parameters = [robot, stick, to_button]
+        option_vars = [robot, to_button, stick]
+        option = StickMoveToButton
+        preconditions = {
+            LiftedAtom(Grasped, [robot, stick]),
+            LiftedAtom(AboveNoButton, []),
+        }
+        add_effects = {
+            LiftedAtom(StickAboveButton, [stick, to_button]),
+        }
+        delete_effects = set()  # NOTE(review): delete AboveNoButton too?
+        stick_moveto_button_from_nothing_nsrt = NSRT(
+            "StickMoveToButtonFromNothing", parameters, preconditions,
+            add_effects, delete_effects, ignore_effects, option, option_vars,
+            null_sampler)
+        nsrts.add(stick_moveto_button_from_nothing_nsrt)
+
+        # RobotPressButton
+        robot = Variable("?robot", robot_type)
+        button = Variable("?button", button_type)
+        parameters = [robot, button]
+        option_vars = [robot, button]
+        option = RobotPressButton
+        preconditions = {
+            LiftedAtom(HandEmpty, [robot]),
+            LiftedAtom(RobotAboveButton, [robot, button]),
+        }
+        add_effects = {LiftedAtom(Pressed, [button])}
+        delete_effects = set()
+        robot_press_button_nsrt = NSRT("RobotPressButton", parameters,
+                                       preconditions, add_effects,
+                                       delete_effects, ignore_effects, option,
+                                       option_vars, null_sampler)
+        nsrts.add(robot_press_button_nsrt)
+
+        # PickStickFromNothing
+        robot = Variable("?robot", robot_type)
+        stick = Variable("?stick", stick_type)
+        parameters = [robot, stick]
+        option_vars = [robot, stick]
+        option = PickStick
+        preconditions = {
+            LiftedAtom(HandEmpty, [robot]),
+            LiftedAtom(AboveNoButton, []),
+        }
+        add_effects = {
+            LiftedAtom(Grasped, [robot, stick]),
+        }
+        delete_effects = {LiftedAtom(HandEmpty, [robot])}
+        ignore_effects = set()
+
+        def pick_stick_sampler(state: State, goal: Set[GroundAtom],
+                               rng: np.random.Generator,
+                               objs: Sequence[Object]) -> Array:
+            del state, goal, objs  # unused
+            # Normalized x position along the long dimension of the stick, in
+            # the center of the short dimension.
+            pick_pos = rng.uniform(0, 1)
+            return np.array([pick_pos], dtype=np.float32)
+
+        pick_stick_nsrt = NSRT("PickStickFromNothing", parameters,
+                               preconditions, add_effects, delete_effects,
+                               ignore_effects, option, option_vars,
+                               pick_stick_sampler)
+        nsrts.add(pick_stick_nsrt)
+
+        # PickStickFromButton
+        robot = Variable("?robot", robot_type)
+        stick = Variable("?stick", stick_type)
+        button = Variable("?from_button", button_type)
+        parameters = [robot, stick, button]
+        option_vars = [robot, stick]
+        option = PickStick
+        preconditions = {
+            LiftedAtom(HandEmpty, [robot]),
+            LiftedAtom(RobotAboveButton, [robot, button])
+        }
+        add_effects = {
+            LiftedAtom(Grasped, [robot, stick]),
+            LiftedAtom(AboveNoButton, [])
+        }
+        delete_effects = {
+            LiftedAtom(HandEmpty, [robot]),
+            LiftedAtom(RobotAboveButton, [robot, button]),
+        }
+        ignore_effects = set()
+        pick_stick_nsrt = NSRT("PickStickFromButton", parameters,
+                               preconditions, add_effects, delete_effects,
+                               ignore_effects, option, option_vars,
+                               pick_stick_sampler)
+        nsrts.add(pick_stick_nsrt)
+
+        # StickPressButton
+        robot = Variable("?robot", robot_type)
+        stick = Variable("?stick", stick_type)
+        button = Variable("?button", button_type)
+        parameters = [robot, stick, button]
+        option_vars = [robot, stick, button]
+        option = StickPressButton
+        preconditions = {
+            LiftedAtom(Grasped, [robot, stick]),
+            LiftedAtom(StickAboveButton, [stick, button])
+        }
+        add_effects = {LiftedAtom(Pressed, [button])}
+        delete_effects = set()
+        ignore_effects = set()
+        stick_button_nsrt = NSRT("StickPressButton", parameters, preconditions,
+                                 add_effects, delete_effects, ignore_effects,
+                                 option, option_vars, null_sampler)
+        nsrts.add(stick_button_nsrt)
+
+        # PlaceStickFromNothing
+        robot = Variable("?robot", robot_type)
+        stick = Variable("?stick", stick_type)
+        holder = Variable("?holder", holder_type)
+        parameters = [robot, stick, holder]
+        option_vars = [robot, stick, holder]
+        option = PlaceStick
+        preconditions = {
+            LiftedAtom(Grasped, [robot, stick]),
+            LiftedAtom(AboveNoButton, []),
+        }
+        add_effects = {
+            LiftedAtom(HandEmpty, [robot]),
+        }
+        delete_effects = {LiftedAtom(Grasped, [robot, stick])}
+        ignore_effects = set()
+
+        def place_stick_sampler(state: State, goal: Set[GroundAtom],
+                                rng: np.random.Generator,
+                                objs: Sequence[Object]) -> Array:
+            del state, goal, objs  # unused
+            # Normalized offset between hand and holder when placing.
+            place_pos = rng.uniform(-1, 1)
+            return np.array([place_pos], dtype=np.float32)
+
+        place_stick_nsrt = NSRT("PlaceStickFromNothing", parameters,
+                                preconditions, add_effects, delete_effects,
+                                ignore_effects, option, option_vars,
+                                place_stick_sampler)
+        nsrts.add(place_stick_nsrt)
+
+        # PlaceStickFromButton
+        robot = Variable("?robot", robot_type)
+        stick = Variable("?stick", stick_type)
+        holder = Variable("?holder", holder_type)
+        from_button = Variable("?from_button", button_type)
+        parameters = [robot, stick, holder, from_button]
+        option_vars = [robot, stick, holder]
+        option = PlaceStick
+        preconditions = {
+            LiftedAtom(StickAboveButton, [stick, from_button]),
+            LiftedAtom(Grasped, [robot, stick]),
+        }
+        add_effects = {
+            LiftedAtom(HandEmpty, [robot]),
+        }
+        delete_effects = {
+            LiftedAtom(Grasped, [robot, stick]),
+            LiftedAtom(StickAboveButton, [stick, from_button])
+        }
+        ignore_effects = set()
+        place_stick_nsrt = NSRT("PlaceStickFromButton", parameters,
+                                preconditions, add_effects, delete_effects,
+                                ignore_effects, option, option_vars,
+                                place_stick_sampler)
+        nsrts.add(place_stick_nsrt)
+
+        return nsrts
diff --git a/predicators/ground_truth_models/stick_button/options.py b/predicators/ground_truth_models/stick_button/options.py
index b148c120e1..a69c1c62c3 100644
--- a/predicators/ground_truth_models/stick_button/options.py
+++ b/predicators/ground_truth_models/stick_button/options.py
@@ -38,12 +38,12 @@ def _RobotPressButton_terminal(state: State, memory: Dict,
                                        objects: Sequence[Object],
                                        params: Array) -> bool:
             del memory, params  # unused
-            _, button = objects
+            _, button, _ = objects
             return Pressed.holds(state, [button])
 
         RobotPressButton = ParameterizedOption(
             "RobotPressButton",
-            types=[robot_type, button_type],
+            types=[robot_type, button_type, stick_type],
             params_space=Box(0, 1, (0, )),
             policy=cls._create_robot_press_button_policy(),
             initiable=lambda s, m, o, p: True,
@@ -110,10 +110,10 @@ def policy(state: State, memory: Dict, objects: Sequence[Object],
                    params: Array) -> Action:
             del memory, params  # unused
             # If the robot and button are already pressing, press.
-            if StickButtonEnv.Above_holds(state, objects):
+            if StickButtonEnv.Above_holds(state, objects[:2]):
                 return Action(np.array([0.0, 0.0, 0.0, 1.0], dtype=np.float32))
             # Otherwise, move toward the button.
-            robot, button = objects
+            robot, button, _ = objects
             rx = state.get(robot, "x")
             ry = state.get(robot, "y")
             px = state.get(button, "x")
@@ -242,3 +242,274 @@ def _get_stick_grasp_loc(cls, state: State, stick: Object,
         tx = sx + scale * np.cos(stheta)
         ty = sy + scale * np.sin(stheta)
         return (tx, ty)
+
+
+class StickButtonMovementGroundTruthOptionFactory(
+        StickButtonGroundTruthOptionFactory):
+    """Ground-truth options for the stick button move environment."""
+
+    @classmethod
+    def get_env_names(cls) -> Set[str]:
+        return {"stick_button_move"}
+
+    @classmethod
+    def get_options(cls, env_name: str, types: Dict[str, Type],
+                    predicates: Dict[str, Predicate],
+                    action_space: Box) -> Set[ParameterizedOption]:
+
+        # First, instantiate the original pick and place options,
+        # but (1) override the policies for RobotPressButton and
+        # StickPressButton to make them no longer move the robot, and (2)
+        # redefine RobotPressButton to update its arguments.
+        init_options = super().get_options(env_name, types, predicates,
+                                           action_space)
+        robot_type = types["robot"]
+        button_type = types["button"]
+        stick_type = types["stick"]
+        holder_type = types["holder"]
+
+        RobotAboveButton = predicates["RobotAboveButton"]
+        StickAboveButton = predicates["StickAboveButton"]
+        Pressed = predicates["Pressed"]
+        Grasped = predicates["Grasped"]
+
+        # RobotMoveToButton
+        def _RobotMoveToButton_terminal(state: State, memory: Dict,
+                                        objects: Sequence[Object],
+                                        params: Array) -> bool:
+            del memory, params  # unused
+            robot, button = objects
+            return RobotAboveButton.holds(state, [robot, button])
+
+        RobotMoveToButton = ParameterizedOption(
+            "RobotMoveToButton",
+            types=[robot_type, button_type],
+            params_space=Box(0, 1, (0, )),
+            policy=cls._create_robot_moveto_button_policy(),
+            initiable=lambda s, m, o, p: True,
+            terminal=_RobotMoveToButton_terminal,
+        )
+
+        # StickMoveToButton
+        def _StickMoveToButton_terminal(state: State, memory: Dict,
+                                        objects: Sequence[Object],
+                                        params: Array) -> bool:
+            del memory, params  # unused
+            _, button, stick = objects
+            return StickAboveButton.holds(state, [stick, button])
+
+        StickMoveToButton = ParameterizedOption(
+            "StickMoveToButton",
+            types=[robot_type, button_type, stick_type],
+            params_space=Box(0, 1, (0, )),
+            policy=cls._create_stick_moveto_button_policy(),
+            initiable=lambda s, m, o, p: True,
+            terminal=_StickMoveToButton_terminal,
+        )
+
+        # RobotPressButton
+        def _RobotPressButton_terminal(state: State, memory: Dict,
+                                       objects: Sequence[Object],
+                                       params: Array) -> bool:
+            del memory, params  # unused
+            _, button = objects
+            return Pressed.holds(state, [button])
+
+        RobotPressButton = ParameterizedOption(
+            "RobotPressButton",
+            types=[robot_type, button_type],
+            params_space=Box(0, 1, (0, )),
+            policy=cls._create_robot_press_button_policy(),
+            initiable=lambda s, m, o, p: True,
+            terminal=_RobotPressButton_terminal,
+        )
+
+        # PlaceStick
+        def _PlaceStick_terminal(state: State, memory: Dict,
+                                 objects: Sequence[Object],
+                                 params: Array) -> bool:
+            del memory, params  # unused
+            robot, stick, _ = objects
+            return not Grasped.holds(state, [robot, stick])
+
+        PlaceStick = ParameterizedOption(
+            "PlaceStick",
+            types=[robot_type, stick_type, holder_type],
+            params_space=Box(-1, 1, (1, )),
+            policy=cls._create_place_stick_policy_diff_signature(),
+            initiable=lambda s, m, o, p: True,
+            terminal=_PlaceStick_terminal,
+        )
+
+        unchanged_options = {
+            opt
+            for opt in init_options
+            if opt.name not in ["RobotPressButton", "PlaceStick"]
+        }
+        changed_options = {RobotPressButton, PlaceStick}
+        new_options = {RobotMoveToButton, StickMoveToButton}
+
+        return unchanged_options | changed_options | new_options
+
+    @classmethod
+    def _create_robot_moveto_button_policy(cls) -> ParameterizedPolicy:
+
+        max_speed = StickButtonEnv.max_speed
+
+        def policy(state: State, memory: Dict, objects: Sequence[Object],
+                   params: Array) -> Action:
+            del memory, params  # unused.
+            # Move the robot toward the button, at most max_speed per axis.
+            robot, button = objects
+            rx = state.get(robot, "x")
+            ry = state.get(robot, "y")
+            px = state.get(button, "x")
+            py = state.get(button, "y")
+            dx = np.clip(px - rx, -max_speed, max_speed)
+            dy = np.clip(py - ry, -max_speed, max_speed)
+            # Normalize.
+            dx = dx / max_speed
+            dy = dy / max_speed
+            # No need to rotate, and we don't want to press until we're there.
+            return Action(np.array([dx, dy, 0.0, -1.0, -1.0],
+                                   dtype=np.float32))
+
+        return policy
+
+    @classmethod
+    def _create_stick_moveto_button_policy(cls) -> ParameterizedPolicy:
+
+        max_speed = StickButtonEnv.max_speed
+
+        def policy(state: State, memory: Dict, objects: Sequence[Object],
+                   params: Array) -> Action:
+            del memory, params  # unused
+            _, button, stick = objects
+            stick_rect = StickButtonEnv.object_to_geom(stick, state)
+            assert isinstance(stick_rect, utils.Rectangle)
+            tip_rect = StickButtonEnv.stick_rect_to_tip_rect(stick_rect)
+            # If the stick is vertical, move the tip toward the button.
+            stheta = state.get(stick, "theta")
+            desired_theta = np.pi / 2
+            if abs(stheta - desired_theta) < 1e-3:
+                tx = tip_rect.x
+                ty = tip_rect.y
+                px = state.get(button, "x")
+                py = state.get(button, "y")
+                dx = np.clip(px - tx, -max_speed, max_speed)
+                dy = np.clip(py - ty, -max_speed, max_speed)
+                # Normalize.
+                dx = dx / max_speed
+                dy = dy / max_speed
+                # No need to rotate or press.
+                return Action(
+                    np.array([dx, dy, 0.0, -1.0, -1.0], dtype=np.float32))
+            assert not CFG.stick_button_disable_angles
+            # Otherwise, rotate the stick.
+            dtheta = np.clip(desired_theta - stheta,
+                             -StickButtonEnv.max_angular_speed,
+                             StickButtonEnv.max_angular_speed)
+            # Normalize.
+            dtheta = dtheta / StickButtonEnv.max_angular_speed
+            return Action(
+                np.array([0.0, 0.0, dtheta, -1.0, -1.0], dtype=np.float32))
+
+        return policy
+
+    @classmethod
+    def _create_robot_press_button_policy(cls) -> ParameterizedPolicy:
+        """Policy that presses iff the robot is above the button."""
+
+        def policy(state: State, memory: Dict, objects: Sequence[Object],
+                   params: Array) -> Action:
+            del memory, params  # unused
+            robot, button = objects
+            # The first three entries stay zero (no motion) and the fifth
+            # stays at -1.0; only the fourth entry, the press signal,
+            # depends on the state: 1.0 presses, -1.0 does nothing.
+            is_above = StickButtonEnv.Above_holds(state, [robot, button])
+            press = 1.0 if is_above else -1.0
+            return Action(
+                np.array([0.0, 0.0, 0.0, press, -1.0], dtype=np.float32))
+
+        return policy
+
+    @classmethod
+    def _create_stick_press_button_policy(cls) -> ParameterizedPolicy:
+        """Policy that presses iff the stick tip overlaps the button."""
+
+        def policy(state: State, memory: Dict, objects: Sequence[Object],
+                   params: Array) -> Action:
+            del memory, params  # unused
+            _, stick, button = objects
+            button_geom = StickButtonEnv.object_to_geom(button, state)
+            stick_geom = StickButtonEnv.object_to_geom(stick, state)
+            assert isinstance(stick_geom, utils.Rectangle)
+            tip_geom = StickButtonEnv.stick_rect_to_tip_rect(stick_geom)
+            # Press (fourth entry 1.0) only while the tip rectangle
+            # intersects the button geometry; otherwise emit the no-op
+            # value -1.0. All other entries are the same in both cases.
+            press = 1.0 if tip_geom.intersects(button_geom) else -1.0
+            return Action(
+                np.array([0.0, 0.0, 0.0, press, -1.0], dtype=np.float32))
+
+        return policy
+
+    @classmethod
+    def _create_pick_stick_policy(cls) -> ParameterizedPolicy:
+        """Policy that moves the robot to the grasp location, then grasps."""
+
+        speed_cap = StickButtonEnv.max_speed
+
+        def policy(state: State, memory: Dict, objects: Sequence[Object],
+                   params: Array) -> Action:
+            del memory  # unused
+            robot, stick = objects
+            robot_x = state.get(robot, "x")
+            robot_y = state.get(robot, "y")
+            goal_x, goal_y = cls._get_stick_grasp_loc(state, stick, params)
+            gap_x = goal_x - robot_x
+            gap_y = goal_y - robot_y
+            if gap_x**2 + gap_y**2 < StickButtonEnv.pick_grasp_tol:
+                # Close enough to the grasp location: pickplace.
+                return Action(
+                    np.array([0.0, 0.0, 0.0, -1.0, 1.0], dtype=np.float32))
+            # Otherwise step toward the target: clip each component to the
+            # speed cap and rescale it into [-1, 1]. No rotation or press.
+            dx = np.clip(gap_x, -speed_cap, speed_cap) / speed_cap
+            dy = np.clip(gap_y, -speed_cap, speed_cap) / speed_cap
+            return Action(np.array([dx, dy, 0.0, -1.0, -1.0],
+                                   dtype=np.float32))
+
+        return policy
+
+    @classmethod
+    def _create_place_stick_policy_diff_signature(cls) -> ParameterizedPolicy:
+        """Policy that moves toward the holder, then puts the stick down."""
+
+        speed_cap = StickButtonEnv.max_speed
+
+        def policy(state: State, memory: Dict, objects: Sequence[Object],
+                   params: Array) -> Action:
+            del memory  # unused
+            robot, _, holder = objects
+            (norm_offset_y, ) = params
+            offset_y = (StickButtonEnv.stick_width / 2) * norm_offset_y
+            robot_x = state.get(robot, "x")
+            robot_y = state.get(robot, "y")
+            goal_x = state.get(holder, "x") - StickButtonEnv.holder_height / 2
+            goal_y = state.get(holder, "y") + offset_y
+            gap_x = goal_x - robot_x
+            gap_y = goal_y - robot_y
+            if gap_x**2 + gap_y**2 < StickButtonEnv.pick_grasp_tol:
+                # Close enough to the target: put the stick down.
+                return Action(
+                    np.array([0.0, 0.0, 0.0, -1.0, 1.0], dtype=np.float32))
+            # Otherwise move toward the target with clipped, normalized
+            # velocities. No need to rotate or press.
+            dx = np.clip(gap_x, -speed_cap, speed_cap) / speed_cap
+            dy = np.clip(gap_y, -speed_cap, speed_cap) / speed_cap
+            return Action(np.array([dx, dy, 0.0, -1.0, -1.0],
+                                   dtype=np.float32))
+
+        return policy
diff --git a/predicators/llm_interface.py b/predicators/llm_interface.py
deleted file mode 100644
index e4d84578e3..0000000000
--- a/predicators/llm_interface.py
+++ /dev/null
@@ -1,137 +0,0 @@
-"""Interface to pretrained large language models."""
-
-import abc
-import logging
-import os
-from typing import List, Optional
-
-import openai
-
-from predicators.settings import CFG
-
-# This is a special string that we assume will never appear in a prompt, and
-# which we use to separate prompt and completion in the cache. The reason to
-# do it this way, rather than saving the prompt and responses separately,
-# is that we want it to be easy to browse the cache as text files.
-_CACHE_SEP = "\n####$$$###$$$####$$$$###$$$####$$$###$$$###\n"
-
-
-class LargeLanguageModel(abc.ABC):
-    """A pretrained large language model."""
-
-    @abc.abstractmethod
-    def get_id(self) -> str:
-        """Get a string identifier for this LLM.
-
-        This identifier should include sufficient information so that
-        querying the same model with the same prompt and same identifier
-        should yield the same result (assuming temperature 0).
-        """
-        raise NotImplementedError("Override me!")
-
-    @abc.abstractmethod
-    def _sample_completions(self,
-                            prompt: str,
-                            temperature: float,
-                            seed: int,
-                            stop_token: Optional[str] = None,
-                            num_completions: int = 1) -> List[str]:
-        """This is the main method that subclasses must implement.
-
-        This helper method is called by sample_completions(), which
-        caches the prompts and responses to disk.
-        """
-        raise NotImplementedError("Override me!")
-
-    def sample_completions(self,
-                           prompt: str,
-                           temperature: float,
-                           seed: int,
-                           stop_token: Optional[str] = None,
-                           num_completions: int = 1) -> List[str]:
-        """Sample one or more completions from a prompt.
-
-        Higher temperatures will increase the variance in the responses.
-
-        The seed may not be used and the results may therefore not be
-        reproducible for LLMs where we only have access through an API that
-        does not expose the ability to set a random seed.
-
-        Responses are saved to disk.
-        """
-        # Set up the cache file.
-        assert _CACHE_SEP not in prompt
-        os.makedirs(CFG.llm_prompt_cache_dir, exist_ok=True)
-        llm_id = self.get_id()
-        prompt_id = hash(prompt)
-        # If the temperature is 0, the seed does not matter.
-        if temperature == 0.0:
-            config_id = f"most_likely_{num_completions}_{stop_token}"
-        else:
-            config_id = f"{temperature}_{seed}_{num_completions}_{stop_token}"
-        cache_filename = f"{llm_id}_{config_id}_{prompt_id}.txt"
-        cache_filepath = os.path.join(CFG.llm_prompt_cache_dir, cache_filename)
-        if not os.path.exists(cache_filepath):
-            if CFG.llm_use_cache_only:
-                raise ValueError("No cached response found for LLM prompt.")
-            logging.debug(f"Querying LLM {llm_id} with new prompt.")
-            # Query the LLM.
-            completions = self._sample_completions(prompt, temperature, seed,
-                                                   stop_token, num_completions)
-            # Cache the completion.
-            cache_str = prompt + _CACHE_SEP + _CACHE_SEP.join(completions)
-            with open(cache_filepath, 'w', encoding='utf-8') as f:
-                f.write(cache_str)
-            logging.debug(f"Saved LLM response to {cache_filepath}.")
-        # Load the saved completion.
-        with open(cache_filepath, 'r', encoding='utf-8') as f:
-            cache_str = f.read()
-        logging.debug(f"Loaded LLM response from {cache_filepath}.")
-        assert cache_str.count(_CACHE_SEP) == num_completions
-        cached_prompt, completion_strs = cache_str.split(_CACHE_SEP, 1)
-        assert cached_prompt == prompt
-        completions = completion_strs.split(_CACHE_SEP)
-        return completions
-
-
-class OpenAILLM(LargeLanguageModel):
-    """Interface to openAI LLMs (GPT-3).
-
-    Assumes that an environment variable OPENAI_API_KEY is set to a
-    private API key for beta.openai.com.
-    """
-
-    def __init__(self, model_name: str) -> None:
-        """See https://beta.openai.com/docs/models/gpt-3 for the list of
-        available model names."""
-        self._model_name = model_name
-        # Note that max_tokens is the maximum response length (not prompt).
-        # From OpenAI docs: "The token count of your prompt plus max_tokens
-        # cannot exceed the model's context length."
-        self._max_tokens = CFG.llm_openai_max_response_tokens
-        assert "OPENAI_API_KEY" in os.environ
-        openai.api_key = os.getenv("OPENAI_API_KEY")
-
-    def get_id(self) -> str:
-        return f"openai-{self._model_name}"
-
-    def _sample_completions(
-            self,
-            prompt: str,
-            temperature: float,
-            seed: int,
-            stop_token: Optional[str] = None,
-            num_completions: int = 1) -> List[str]:  # pragma: no cover
-        del seed  # unused
-        response = openai.Completion.create(
-            model=self._model_name,  # type: ignore
-            prompt=prompt,
-            temperature=temperature,
-            max_tokens=self._max_tokens,
-            stop=stop_token,
-            n=num_completions)
-        assert len(response["choices"]) == num_completions
-        text_responses = [
-            response["choices"][i]["text"] for i in range(num_completions)
-        ]
-        return text_responses
diff --git a/predicators/pretrained_model_interface.py b/predicators/pretrained_model_interface.py
new file mode 100644
index 0000000000..8bee546259
--- /dev/null
+++ b/predicators/pretrained_model_interface.py
@@ -0,0 +1,251 @@
+"""Interface to pretrained large models.
+
+These might be joint Vision-Language Models (VLM's) or Large Language
+Models (LLM's)
+"""
+
+import abc
+import logging
+import os
+import time
+from typing import List, Optional
+
+import google
+import google.generativeai as genai
+import imagehash
+import openai
+import PIL.Image
+
+from predicators.settings import CFG
+
+# This is a special string that we assume will never appear in a prompt, and
+# which we use to separate prompt and completion in the cache. The reason to
+# do it this way, rather than saving the prompt and responses separately,
+# is that we want it to be easy to browse the cache as text files.
+_CACHE_SEP = "\n####$$$###$$$####$$$$###$$$####$$$###$$$###\n"
+
+
+class PretrainedLargeModel(abc.ABC):
+    """A pretrained large vision or language model."""
+
+    @abc.abstractmethod
+    def get_id(self) -> str:
+        """Get a string identifier for this model.
+
+        This identifier should include sufficient information so that
+        querying the same model with the same prompt and same identifier
+        should yield the same result (assuming temperature 0).
+        """
+        raise NotImplementedError("Override me!")
+
+    @abc.abstractmethod
+    def _sample_completions(self,
+                            prompt: str,
+                            imgs: Optional[List[PIL.Image.Image]],
+                            temperature: float,
+                            seed: int,
+                            stop_token: Optional[str] = None,
+                            num_completions: int = 1) -> List[str]:
+        """This is the main method that subclasses must implement.
+
+        This helper method is called by sample_completions(), which
+        caches the prompts and responses to disk.
+        """
+        raise NotImplementedError("Override me!")
+
+    def sample_completions(self,
+                           prompt: str,
+                           imgs: Optional[List[PIL.Image.Image]],
+                           temperature: float,
+                           seed: int,
+                           stop_token: Optional[str] = None,
+                           num_completions: int = 1) -> List[str]:
+        """Sample one or more completions from a prompt.
+
+        Higher temperatures will increase the variance in the responses.
+        The seed may not be used and the results may therefore not be
+        reproducible for models where we only have access through an API
+        that does not expose the ability to set a random seed. Responses
+        are saved to disk.
+        """
+        # Set up the cache file.
+        assert _CACHE_SEP not in prompt
+        os.makedirs(CFG.pretrained_model_prompt_cache_dir, exist_ok=True)
+        model_id = self.get_id()
+        # Key the cache on a content digest, not the built-in hash(): str
+        # hashes are salted per process unless PYTHONHASHSEED is fixed, so
+        # hash() would name a different cache folder on every run.
+        import hashlib  # pylint:disable=import-outside-toplevel
+        prompt_id = hashlib.sha256(prompt.encode("utf-8")).hexdigest()[:16]
+        config_id = f"{temperature}_{seed}_{num_completions}_" + \
+                f"{stop_token}"
+        # If the temperature is 0, the seed does not matter.
+        if temperature == 0.0:
+            config_id = f"most_likely_{num_completions}_{stop_token}"
+        cache_foldername = f"{model_id}_{config_id}_{prompt_id}"
+        if imgs is not None:
+            # We also need to hash all the images in the prompt.
+            # NOTE: it's very possible that this string gets too long and this
+            # causes significant problems for us. We can fix this when it
+            # comes up by hashing this string to a shorter string, using e.g.
+            # https://stackoverflow.com/questions/57263436/hash-like-string-shortener-with-decoder  # pylint:disable=line-too-long
+            imgs_id = "".join(str(imagehash.phash(img)) for img in imgs)
+            cache_foldername += f"{imgs_id}"
+        cache_folderpath = os.path.join(CFG.pretrained_model_prompt_cache_dir,
+                                        cache_foldername)
+        os.makedirs(cache_folderpath, exist_ok=True)
+        cache_filename = "prompt.txt"
+        cache_filepath = os.path.join(cache_folderpath, cache_filename)
+        if not os.path.exists(cache_filepath):
+            if CFG.llm_use_cache_only:
+                raise ValueError("No cached response found for prompt.")
+            logging.debug(f"Querying model {model_id} with new prompt.")
+            # Query the model.
+            completions = self._sample_completions(prompt, imgs, temperature,
+                                                   seed, stop_token,
+                                                   num_completions)
+            # Cache the completion.
+            cache_str = prompt + _CACHE_SEP + _CACHE_SEP.join(completions)
+            with open(cache_filepath, 'w', encoding='utf-8') as f:
+                f.write(cache_str)
+            if imgs is not None:
+                # Also save the images for easy debugging.
+                imgs_folderpath = os.path.join(cache_folderpath, "imgs")
+                os.makedirs(imgs_folderpath, exist_ok=True)
+                for i, img in enumerate(imgs):
+                    filename_suffix = str(i) + ".jpg"
+                    img.save(os.path.join(imgs_folderpath, filename_suffix))
+            logging.debug(f"Saved model response to {cache_filepath}.")
+        # Load the saved completion.
+        with open(cache_filepath, 'r', encoding='utf-8') as f:
+            cache_str = f.read()
+        logging.debug(f"Loaded model response from {cache_filepath}.")
+        assert cache_str.count(_CACHE_SEP) == num_completions
+        cached_prompt, completion_strs = cache_str.split(_CACHE_SEP, 1)
+        assert cached_prompt == prompt
+        completions = completion_strs.split(_CACHE_SEP)
+        return completions
+
+
+class VisionLanguageModel(PretrainedLargeModel):
+    """A class for all VLM's; every query must come with images."""
+
+    def sample_completions(
+            self,
+            prompt: str,
+            imgs: Optional[List[PIL.Image.Image]],
+            temperature: float,
+            seed: int,
+            stop_token: Optional[str] = None,
+            num_completions: int = 1) -> List[str]:  # pragma: no cover
+        assert imgs is not None  # a VLM query without images is an error
+        return super().sample_completions(prompt, imgs, temperature, seed,
+                                          stop_token, num_completions)
+
+
+class LargeLanguageModel(PretrainedLargeModel):
+    """A class for all LLM's; queries are text-only (imgs must be None)."""
+
+    def sample_completions(
+            self,
+            prompt: str,
+            imgs: Optional[List[PIL.Image.Image]],
+            temperature: float,
+            seed: int,
+            stop_token: Optional[str] = None,
+            num_completions: int = 1) -> List[str]:  # pragma: no cover
+        assert imgs is None  # an LLM query must not carry images
+        return super().sample_completions(prompt, imgs, temperature, seed,
+                                          stop_token, num_completions)
+
+
+class OpenAILLM(LargeLanguageModel):
+    """Interface to OpenAI text-completion LLMs (GPT-3).
+
+    Assumes that an environment variable OPENAI_API_KEY is set to a
+    private API key for beta.openai.com; __init__ asserts that it exists.
+    """
+
+    def __init__(self, model_name: str) -> None:
+        """See https://beta.openai.com/docs/models/gpt-3 for the list of
+        available model names."""
+        self._model_name = model_name
+        # Note that max_tokens is the maximum response length (not prompt).
+        # From OpenAI docs: "The token count of your prompt plus max_tokens
+        # cannot exceed the model's context length."
+        self._max_tokens = CFG.llm_openai_max_response_tokens
+        assert "OPENAI_API_KEY" in os.environ
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    def get_id(self) -> str:
+        return f"openai-{self._model_name}"
+
+    def _sample_completions(
+            self,
+            prompt: str,
+            imgs: Optional[List[PIL.Image.Image]],
+            temperature: float,
+            seed: int,
+            stop_token: Optional[str] = None,
+            num_completions: int = 1) -> List[str]:  # pragma: no cover
+        del imgs, seed  # unused: neither is sent in the request below
+        response = openai.Completion.create(
+            model=self._model_name,  # type: ignore
+            prompt=prompt,
+            temperature=temperature,
+            max_tokens=self._max_tokens,
+            stop=stop_token,
+            n=num_completions)
+        assert len(response["choices"]) == num_completions
+        text_responses = [
+            response["choices"][i]["text"] for i in range(num_completions)
+        ]
+        return text_responses
+
+
+class GoogleGeminiVLM(VisionLanguageModel):
+    """Interface to the Google Gemini VLM (1.5).
+
+    Assumes that an environment variable GOOGLE_API_KEY is set with the
+    necessary API key to query the particular model name.
+    """
+
+    def __init__(self, model_name: str) -> None:
+        """See https://ai.google.dev/models/gemini for the list of available
+        model names."""
+        self._model_name = model_name
+        assert "GOOGLE_API_KEY" in os.environ
+        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+        self._model = genai.GenerativeModel(self._model_name)  # pylint:disable=no-member
+
+    def get_id(self) -> str:
+        return f"Google-{self._model_name}"
+
+    def _sample_completions(
+            self,
+            prompt: str,
+            imgs: Optional[List[PIL.Image.Image]],
+            temperature: float,
+            seed: int,
+            stop_token: Optional[str] = None,
+            num_completions: int = 1) -> List[str]:  # pragma: no cover
+        del seed, stop_token  # unused
+        assert imgs is not None
+        generation_config = genai.types.GenerationConfig(  # pylint:disable=no-member
+            candidate_count=num_completions,
+            temperature=temperature)
+        response = None
+        while response is None:
+            try:
+                response = self._model.generate_content(
+                    [prompt] + imgs,
+                    generation_config=generation_config)  # type: ignore
+                break
+            except google.api_core.exceptions.ResourceExhausted:
+                # ResourceExhausted is treated as a rate limit: wait 3s and
+                # let the loop retry. The number of retries is not capped.
+                logging.debug(
+                    "Hit rate limit for Gemini queries; trying again in 3s!")
+                time.sleep(3.0)
+        response.resolve()
+        return [response.text]  # NOTE(review): always a single completion
diff --git a/predicators/settings.py b/predicators/settings.py
index f3957cf75d..02348dace7 100644
--- a/predicators/settings.py
+++ b/predicators/settings.py
@@ -400,7 +400,7 @@ class GlobalSettings:
     nsrt_rl_valid_reward_steps_threshold = 10
 
     # parameters for large language models
-    llm_prompt_cache_dir = "llm_cache"
+    pretrained_model_prompt_cache_dir = "pretrained_model_cache"
     llm_openai_max_response_tokens = 700
     llm_use_cache_only = False
     llm_model_name = "text-curie-001"  # "text-davinci-002"
@@ -408,6 +408,9 @@ class GlobalSettings:
     llm_num_completions = 1
     override_json_with_input = False  # Only works with SpotEnv for now
 
+    # parameters for vision language models
+    vlm_model_name = "gemini-pro-vision"  # "gemini-1.5-pro-latest"
+
     # SeSamE parameters
     sesame_task_planner = "astar"  # "astar" or "fdopt" or "fdsat"
     sesame_task_planning_heuristic = "lmcut"
@@ -625,6 +628,7 @@ class GlobalSettings:
     grammar_search_grammar_includes_givens = True
     grammar_search_grammar_includes_foralls = True
     grammar_search_grammar_use_diff_features = False
+    grammar_search_grammar_use_euclidean_dist = False
     grammar_search_use_handcoded_debug_grammar = False
     grammar_search_pred_selection_approach = "score_optimization"
     grammar_search_pred_clusterer = "oracle"
@@ -635,6 +639,7 @@ class GlobalSettings:
     grammar_search_pred_complexity_weight = 1e-4
     grammar_search_max_predicates = 200
     grammar_search_predicate_cost_upper_bound = 6
+    grammar_search_prune_redundant_preds = True
     grammar_search_score_function = "expected_nodes_created"
     grammar_search_heuristic_based_weight = 10.
     grammar_search_max_demos = float("inf")
@@ -653,10 +658,20 @@ class GlobalSettings:
     grammar_search_expected_nodes_backtracking_cost = 1e3
     grammar_search_expected_nodes_allow_noops = True
     grammar_search_classifier_pretty_str_names = ["?x", "?y", "?z"]
+    grammar_search_vlm_atom_proposal_prompt_type = "options_labels_whole_traj"
+    grammar_search_vlm_atom_label_prompt_type = "per_scene_naive"
+    grammar_search_vlm_atom_proposal_use_debug = False
 
     # grammar search clustering algorithm parameters
     grammar_search_clustering_gmm_num_components = 10
 
+    # filepath to be used if offline_data_method is set to
+    # demo+labelled_atoms
+    handmade_demo_filename = ""
+    # filepath to be used if offline_data_method is set to
+    # img_demos
+    vlm_trajs_folder_name = ""
+
     @classmethod
     def get_arg_specific_settings(cls, args: Dict[str, Any]) -> Dict[str, Any]:
         """A workaround for global settings that are derived from the
@@ -737,6 +752,7 @@ def get_arg_specific_settings(cls, args: Dict[str, Any]) -> Dict[str, Any]:
                     "exit_garage": 1000,
                     "tools": 1000,
                     "stick_button": 1000,
+                    "stick_button_move": 1000
                 })[args.get("env", "")],
 
             # In SeSamE, the maximum effort put into refining a single skeleton.
@@ -758,6 +774,36 @@ def get_arg_specific_settings(cls, args: Dict[str, Any]) -> Dict[str, Any]:
                     "tools": 1,
                 })[args.get("env", "")],
 
+            # Factor to divide feature range by when instantiating predicates
+            # of the form |t1.f1 - t2.f2| < c to indicate that t1.f1 and
+            # t2.f2 are "touching" or close. E.g. for the predicate
+            # |robot.x - button.x| < c in the StickButtonMovement env,
+            # we set this constant to 1/30.0 because that will yield
+            # |robot.x - button.x| < ((ub - lb)/30.0) + ub, which corresponds
+            # to a predicate that correctly classifies when the robot and
+            # button are touching.
+            grammar_search_diff_features_const_multiplier=defaultdict(
+                lambda: 1e-6,
+                {"stick_button_move": 1 / 30.0})[args.get("env", "")],
+
+            # Feature names to use as part of the EuclideanPredicateGrammar.
+            # Each entry is (type1_feature1name, type1_feature2name,
+            # type2_feature1name, type2_feature2name)
+            grammar_search_euclidean_feature_names=defaultdict(
+                lambda: [("x", "y", "x", "y")], {
+                    "stick_button_move": [("x", "y", "x", "y"),
+                                          ("x", "y", "tip_x", "tip_y")]
+                })[args.get("env", "")],
+
+            # Factor to divide feature range by when instantiating euclidean
+            # predicates of the form
+            # (t1.f1 - t2.f1)^2 + (t1.f2 - t2.f2)^2 < c^2 to indicate that
+            # the euclidean distance between t1 and t2 is small enough that
+            # the two objects are "touching".
+            grammar_search_euclidean_const_multiplier=defaultdict(
+                lambda: 1e-6,
+                {"stick_button_move": 1 / 250.0})[args.get("env", "")],
+
             # Parameters specific to the cover environment.
             # cover env parameters
             cover_num_blocks=defaultdict(lambda: 2, {
diff --git a/predicators/structs.py b/predicators/structs.py
index 651e09e520..4fd65ecf09 100644
--- a/predicators/structs.py
+++ b/predicators/structs.py
@@ -10,6 +10,7 @@
     List, Optional, Sequence, Set, Tuple, TypeVar, Union, cast
 
 import numpy as np
+import PIL.Image
 from gym.spaces import Box
 from numpy.typing import NDArray
 from tabulate import tabulate
@@ -29,6 +30,13 @@ def dim(self) -> int:
         """Dimensionality of the feature vector of this object type."""
         return len(self.feature_names)
 
+    @property
+    def oldest_ancestor(self) -> Type:
+        """Return the root of this type's parent chain (self if no parent)."""
+        # Recurse upward until a type with no parent is reached.
+        parent = self.parent
+        return self if parent is None else parent.oldest_ancestor
+
     def __call__(self, name: str) -> _TypedEntity:
         """Convenience method for generating _TypedEntities."""
         if name.startswith("?"):
@@ -1119,6 +1127,46 @@ def train_task_idx(self) -> int:
         return self._train_task_idx
 
 
+@dataclass(frozen=True, repr=False, eq=False)
+class ImageOptionTrajectory:
+    """A structure similar to a LowLevelTrajectory where instead of low-level
+    states and actions, we record images at every state (i.e., observations),
+    as well as the option that was executed to get between observation images.
+
+    Invariant 1: If this trajectory is a demonstration, it must contain a
+    train task idx and achieve the goal in the respective train task. Only
+    the presence of the idx is asserted here; goal achievement is checked
+    where the trajectory is created (in datasets), since it stores no goal.
+    Invariant 2: The length of the state images sequence is always one
+    greater than the length of the action sequence (asserted on creation).
+    """
+    _objects: Collection[Object]
+    _state_imgs: List[List[PIL.Image.Image]]  # one list of images per state
+    _actions: List[_Option]  # options executed between consecutive states
+    _is_demo: bool = field(default=False)
+    _train_task_idx: Optional[int] = field(default=None)
+
+    def __post_init__(self) -> None:
+        assert len(self._state_imgs) == len(self._actions) + 1
+        if self._is_demo:
+            assert self._train_task_idx is not None
+
+    @property
+    def imgs(self) -> List[List[PIL.Image.Image]]:
+        """Images recorded at each state of the trajectory."""
+        return self._state_imgs
+
+    @property
+    def objects(self) -> Collection[Object]:
+        """Objects important to the trajectory."""
+        return self._objects
+
+    @property
+    def actions(self) -> List[_Option]:
+        """Options executed between consecutive state images."""
+        return self._actions
+
+
 @dataclass(repr=False, eq=False)
 class Dataset:
     """A collection of LowLevelTrajectory objects, and optionally, lists of
diff --git a/predicators/third_party/fast_downward_translator/simplify.py b/predicators/third_party/fast_downward_translator/simplify.py
index 8dc63865fc..5d73f9082c 100644
--- a/predicators/third_party/fast_downward_translator/simplify.py
+++ b/predicators/third_party/fast_downward_translator/simplify.py
@@ -43,7 +43,7 @@ class DomainTransitionGraph:
     Attributes:
     - init (int): the initial state value of the DTG variable
     - size (int): the number of values in the domain
-    - arcs (defaultdict: int -> set(int)): the DTG arcs (unlabeled)
+    - arcs (defaultdict: int -> set(int)): the DTG arcs (unlabelled)
 
     There are no transition labels or goal values.
 
diff --git a/predicators/utils.py b/predicators/utils.py
index f12ba169e7..7293848f78 100644
--- a/predicators/utils.py
+++ b/predicators/utils.py
@@ -36,6 +36,7 @@
 except ModuleNotFoundError:  # pragma: no cover
     _TTS_AVAILABLE = False
 
+import dill as pkl
 import imageio
 import matplotlib
 import matplotlib.pyplot as plt
@@ -2694,6 +2695,59 @@ def prune_ground_atom_dataset(
     return new_ground_atom_dataset
 
 
+def load_ground_atom_dataset(
+        dataset_fname: str,
+        trajectories: List[LowLevelTrajectory]) -> List[GroundAtomTrajectory]:
+    """Rebuild a ground atom dataset from atoms that were saved to disk.
+
+    The file at dataset_fname holds only the ground atoms: one sequence
+    of atom sets per trajectory. The low-level trajectories themselves
+    are not stored in that file (they can be saved separately), so the
+    caller passes them in and each one is paired with the atom sequence
+    found at the same index.
+
+    Raises a ValueError if dataset_fname does not exist.
+    """
+    os.makedirs(CFG.data_dir, exist_ok=True)
+    # Bail out early if nothing was saved under this filename.
+    if not os.path.exists(dataset_fname):
+        raise ValueError(f"Cannot load ground atoms: {dataset_fname}")
+
+    # Read back the pickled atom sequences.
+    saved_atom_seqs: Optional[List[List[Set[GroundAtom]]]] = None
+    with open(dataset_fname, "rb") as f:
+        saved_atom_seqs = pkl.load(f)
+    assert saved_atom_seqs is not None
+    # There must be exactly one saved atom sequence per trajectory.
+    assert len(trajectories) == len(saved_atom_seqs)
+    logging.info("\n\nLOADED GROUND ATOM DATASET")
+
+    # Pair every trajectory with a fresh copy of its atom sets; each
+    # (trajectory, atom sequence) tuple is one GroundAtomTrajectory.
+    ground_atom_dataset: List[GroundAtomTrajectory] = [
+        (traj, [set(atoms) for atoms in saved_atom_seqs[i]])
+        for i, traj in enumerate(trajectories)
+    ]
+    return ground_atom_dataset
+
+
+def save_ground_atom_dataset(ground_atom_dataset: List[GroundAtomTrajectory],
+                             dataset_fname: str) -> None:
+    """Pickle the ground atoms of a dataset so they can be reloaded later.
+
+    Each GroundAtomTrajectory pairs a LowLevelTrajectory with a list of
+    sets of GroundAtoms. Only the atom lists are written to
+    dataset_fname; the LowLevelTrajectories are saved separately.
+    """
+    # Element [1] of every entry is its sequence of atom sets. Each
+    # sequence is copied into a new list, so the pickle holds one list
+    # of atom sets per trajectory and nothing else from the entry.
+    # The sets themselves are stored as they are.
+    atom_seqs_to_pkl = [list(entry[1]) for entry in ground_atom_dataset]
+    with open(dataset_fname, "wb") as f:
+        pkl.dump(atom_seqs_to_pkl, f)
+
+
 def extract_preds_and_types(
     ops: Collection[NSRTOrSTRIPSOperator]
 ) -> Tuple[Dict[str, Predicate], Dict[str, Type]]:
@@ -3258,12 +3312,22 @@ def get_env_asset_path(asset_name: str, assert_exists: bool = True) -> str:
 
 def get_third_party_path() -> str:
     """Return the absolute path to the third party directory."""
-    module_path = Path(__file__)
-    predicators_dir = module_path.parent
-    third_party_dir_path = os.path.join(predicators_dir, "third_party")
+    third_party_dir_path = os.path.join(get_path_to_predicators_root(),
+                                        "predicators/third_party")
     return third_party_dir_path
 
 
+def get_path_to_predicators_root() -> str:
+    """Return the path to the predicators root directory as a string.
+
+    This is the grandparent of this module's file (Path(__file__)), which
+    looks like '<installation-path>/predicators' with no '/' at the end.
+    """
+    module_path = Path(__file__)
+    predicators_dir = module_path.parent.parent
+    return str(predicators_dir)
+
+
 def import_submodules(path: List[str], name: str) -> None:
     """Load all submodules on the given path.
 
@@ -3605,13 +3669,34 @@ def find_all_balanced_expressions(s: str) -> List[str]:
     return exprs
 
 
-def f_range_intersection(lb1: float, ub1: float, lb2: float,
-                         ub2: float) -> bool:
-    """Given upper and lower bounds for two feature ranges, returns True iff
-    the ranges intersect."""
+def range_intersection(lb1: float, ub1: float, lb2: float, ub2: float) -> bool:
+    """Return True iff closed ranges [lb1, ub1] and [lb2, ub2] share a point;
+    ranges that only touch at an endpoint count as intersecting."""
     return (lb1 <= lb2 <= ub1) or (lb2 <= lb1 <= ub2)
 
 
+def compute_abs_range_given_two_ranges(lb1: float, ub1: float, lb2: float,
+                                       ub2: float) -> Tuple[float, float]:
+    """Return the lower and upper bounds of |f1 - f2| for two feature ranges.
+
+    Here f1 lies in [lb1, ub1] and f2 lies in [lb2, ub2].
+    """
+    # We must compute the lower and upper bounds of the
+    # expression |f1 - f2|. If the intervals
+    # [lb1, ub1] and [lb2, ub2] overlap, then the lower
+    # bound of the expression is just 0. Otherwise, if
+    # lb2 > ub1, the lower bound is |ub1 - lb2|, and if
+    # ub2 < lb1, the lower bound is |lb1 - ub2|.
+    if range_intersection(lb1, ub1, lb2, ub2):
+        lb = 0.0
+    else:
+        lb = min(abs(lb2 - ub1), abs(lb1 - ub2))
+    # The upper bound is the larger of the distances
+    # between opposite endpoints of the two intervals.
+    ub = max(abs(ub2 - lb1), abs(ub1 - lb2))
+    return (lb, ub)
+
+
 def roundrobin(iterables: Sequence[Iterator]) -> Iterator:
     """roundrobin(['ABC...', 'D...', 'EF...']) --> A D E B F C..."""
     # Recipe credited to George Sakkis, code adapted slightly from
diff --git a/scripts/plotting/create_classification_plots.py b/scripts/plotting/create_classification_plots.py
index 95e5aacb4b..bafc3ea5e6 100644
--- a/scripts/plotting/create_classification_plots.py
+++ b/scripts/plotting/create_classification_plots.py
@@ -159,7 +159,7 @@ def heatmap(data: Array,
     ax.set_yticks(yticks)
     ax.set_xticklabels(xtick_labels)
     ax.set_yticklabels(ytick_labels)
-    # Let the horizontal axes labeling appear on top
+    # Let the horizontal axes labelling appear on top
     ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)
     # Rotate the tick labels and set their alignment
     plt.setp(ax.get_xticklabels(),
diff --git a/setup.py b/setup.py
index 0f06aeceb9..e35789bffb 100644
--- a/setup.py
+++ b/setup.py
@@ -42,7 +42,9 @@
         "opencv-python == 4.7.0.72",
         "pg3@git+https://github.com/tomsilver/pg3.git",
         "gym_sokoban@git+https://github.com/Learning-and-Intelligent-Systems/gym-sokoban.git",  # pylint: disable=line-too-long
-        "pbrspot@git+https://github.com/NishanthJKumar/pbrspot.git"
+        "pbrspot@git+https://github.com/NishanthJKumar/pbrspot.git",
+        "ImageHash",
+        "google-generativeai"
     ],
     include_package_data=True,
     extras_require={
diff --git a/tests/approaches/test_grammar_search_invention_approach.py b/tests/approaches/test_grammar_search_invention_approach.py
index 529edd6c9b..11ff701eb6 100644
--- a/tests/approaches/test_grammar_search_invention_approach.py
+++ b/tests/approaches/test_grammar_search_invention_approach.py
@@ -9,11 +9,14 @@
 from predicators.approaches.grammar_search_invention_approach import \
     GrammarSearchInventionApproach, _AttributeDiffCompareClassifier, \
     _create_grammar, _DataBasedPredicateGrammar, \
+    _EuclideanAttributeDiffCompareClassifier, \
+    _EuclideanDistancePredicateGrammar, \
     _FeatureDiffInequalitiesPredicateGrammar, _ForallClassifier, \
     _halving_constant_generator, _NegationClassifier, _PredicateGrammar, \
     _SingleAttributeCompareClassifier, \
     _SingleFeatureInequalitiesPredicateGrammar, _UnaryFreeForallClassifier
 from predicators.envs.cover import CoverEnv
+from predicators.envs.stick_button import StickButtonMovementEnv
 from predicators.ground_truth_models import get_gt_options
 from predicators.settings import CFG
 from predicators.structs import Action, Dataset, LowLevelTrajectory, Object, \
@@ -49,6 +52,8 @@ def test_predicate_grammar(segmenter):
     env = CoverEnv()
     single_ineq_grammar = _SingleFeatureInequalitiesPredicateGrammar(dataset)
     diff_ineq_grammar = _FeatureDiffInequalitiesPredicateGrammar(dataset)
+    euclidean_grammar = _EuclideanDistancePredicateGrammar(
+        dataset, "x", "y", "x", "y")
     assert len(single_ineq_grammar.generate(max_num=1)) == 1
     sing_feature_ranges = single_ineq_grammar._get_feature_ranges()  # pylint: disable=protected-access
     assert sing_feature_ranges[robby.type]["hand"] == (0.5, 0.8)
@@ -56,6 +61,15 @@ def test_predicate_grammar(segmenter):
     doub_feature_ranges = diff_ineq_grammar._get_feature_ranges()  # pylint: disable=protected-access
     assert doub_feature_ranges[robby.type]["hand"] == (0.5, 0.8)
     assert doub_feature_ranges[block.type]["grasp"] == (-1, 1)
+    euclidean_feature_ranges = euclidean_grammar._get_feature_ranges()  # pylint: disable=protected-access
+    assert euclidean_feature_ranges[block.type]["is_block"] == (1.0, 1.0)
+    assert euclidean_feature_ranges[robby.type]["hand"] == (0.5, 0.8)
+
+    # Generate from the diff ineq grammar and verify that the number of
+    # candidates generated is under the limit.
+    preds = diff_ineq_grammar.generate(max_num=100)
+    assert len(preds) <= 100
+
     forall_grammar = _create_grammar(dataset, env.predicates)
     # Test edge case where there are no low-level features in the dataset.
     dummy_type = Type("dummy", [])
@@ -69,8 +83,11 @@ def test_predicate_grammar(segmenter):
         dummy_dataset)
     dummy_doub_grammar = _FeatureDiffInequalitiesPredicateGrammar(
         dummy_dataset)
+    dummy_euc_grammar = _EuclideanDistancePredicateGrammar(
+        dummy_dataset, "x", "y", "x", "y")
     assert len(dummy_sing_grammar.generate(max_num=1)) == 0
     assert len(dummy_doub_grammar.generate(max_num=1)) == 0
+    assert len(dummy_euc_grammar.generate(max_num=1)) == 0
     # There are only so many unique predicates possible under the grammar.
     # Non-unique predicates are pruned. Note that with a larger dataset,
     # more predicates would appear unique.
@@ -80,10 +97,10 @@ def test_predicate_grammar(segmenter):
     utils.reset_config({
         "grammar_search_grammar_use_diff_features": True,
         "segmenter": segmenter,
-        "env": "cover"
+        "env": "cover",
     })
     forall_grammar = _create_grammar(dataset, env.predicates)
-    assert len(forall_grammar.generate(max_num=100)) == 55
+    assert len(forall_grammar.generate(max_num=100)) == 9
     # Test CFG.grammar_search_predicate_cost_upper_bound.
     default = CFG.grammar_search_predicate_cost_upper_bound
     utils.reset_config({"grammar_search_predicate_cost_upper_bound": 0})
@@ -107,6 +124,75 @@ def test_predicate_grammar(segmenter):
     utils.update_config({"grammar_search_use_handcoded_debug_grammar": False})
 
 
+def test_labelled_atoms_invention():
+    """Tests learning from a dataset annotated with labelled ground atoms."""
+    utils.reset_config({
+        "env": "cover",
+        "offline_data_method": "demo+labelled_atoms"
+    })
+    env = CoverEnv()
+    train_task = env.get_train_tasks()[0].task
+    state = train_task.init
+    other_state = state.copy()
+    robby = [o for o in state if o.type.name == "robot"][0]
+    block = [o for o in state if o.name == "block0"][0]
+    state.set(robby, "hand", 0.5)
+    other_state.set(robby, "hand", 0.8)
+    state.set(block, "grasp", -1)
+    other_state.set(block, "grasp", 1)
+    preds = env.predicates
+    assert len(preds) == 5
+    ground_atoms = []
+    for s in [state, other_state]:
+        curr_state_atoms = utils.abstract(s, preds)
+        ground_atoms.append(curr_state_atoms)
+
+    ll_trajs = [
+        LowLevelTrajectory([state, other_state],
+                           [Action(np.zeros(1, dtype=np.float32))])
+    ]
+    dataset = Dataset(ll_trajs, [ground_atoms])
+
+    approach = GrammarSearchInventionApproach(env.predicates,
+                                              get_gt_options(env.get_name()),
+                                              env.types, env.action_space,
+                                              [train_task])
+
+    with pytest.raises(AssertionError):
+        # The below command should fail because even though it should be able
+        # to extract predicates from the dataset, the trajectories' actions
+        # don't have options that can be used.
+        approach.learn_from_offline_dataset(dataset)
+
+
+def test_euclidean_grammar():
+    """Tests grammar generation with Euclidean distance predicates enabled."""
+    utils.reset_config({"env": "stick_button_move"})
+    env = StickButtonMovementEnv()
+    train_task = env.get_train_tasks()[0].task
+    state = train_task.init
+    other_state = state.copy()
+    robby = [o for o in state if o.type.name == "robot"][0]
+    curr_x = state.get(robby, "x")
+    curr_y = state.get(robby, "y")
+    other_state.set(robby, "x", curr_x + 0.05)
+    other_state.set(robby, "y", curr_y + 0.05)
+    dataset = Dataset([
+        LowLevelTrajectory([state, other_state],
+                           [Action(np.zeros(4, dtype=np.float32))])
+    ])
+    utils.reset_config({
+        "grammar_search_grammar_use_euclidean_dist": True,
+        "segmenter": "atom_changes",
+    })
+    grammar = _create_grammar(dataset, env.predicates)
+    assert len(grammar.generate(max_num=100)) == 28
+    utils.reset_config({
+        "grammar_search_grammar_use_euclidean_dist": False,
+        "segmenter": "contacts"
+    })
+
+
 def test_halving_constant_generator():
     """Tests for _halving_constant_generator()."""
     expected_constants = [0.5, 0.25, 0.75, 0.125, 0.625, 0.375, 0.875]
@@ -204,6 +290,17 @@ def test_unary_free_forall_classifier():
                                         "¬(∀ ?y:plate_type . ¬On(?x, ?y))")
 
 
+def test_euclidean_classifier_and_grammar():
+    """Tests that _EuclideanAttributeDiffCompareClassifier.pretty_str()
+    produces the expected variable and body strings."""
+    a_type = Type("a_type", ["x", "y"])
+    b_type = Type("b_type", ["x", "y"])
+    classifier0 = _EuclideanAttributeDiffCompareClassifier(
+        0, a_type, "x", "y", 1, b_type, "x", "y", 1.0, 0, gt, ">")
+    assert classifier0.pretty_str() == (
+        '?x:a_type, ?y:b_type', '((?x.x - ?y.x)^2  + ((?x.y - ?y.y)^2 > 1.0)')
+
+
 def test_unrecognized_clusterer():
     """Tests that a dummy name for the 'clusterer' argument will trigger a
     failure.
diff --git a/tests/approaches/test_llm_bilevel_planning_approach.py b/tests/approaches/test_llm_bilevel_planning_approach.py
index f49cd51195..33e304adf2 100644
--- a/tests/approaches/test_llm_bilevel_planning_approach.py
+++ b/tests/approaches/test_llm_bilevel_planning_approach.py
@@ -8,7 +8,7 @@
 from predicators.datasets import create_dataset
 from predicators.envs import create_new_env
 from predicators.ground_truth_models import get_gt_options
-from predicators.llm_interface import LargeLanguageModel
+from predicators.pretrained_model_interface import LargeLanguageModel
 
 
 def test_llm_bilevel_planning_approach():
@@ -17,7 +17,7 @@ def test_llm_bilevel_planning_approach():
     cache_dir = "_fake_llm_cache_dir"
     utils.reset_config({
         "env": env_name,
-        "llm_prompt_cache_dir": cache_dir,
+        "pretrained_model_prompt_cache_dir": cache_dir,
         "approach": "llm_bilevel_planning",
         "num_train_tasks": 1,
         "num_test_tasks": 1,
@@ -48,11 +48,12 @@ def get_id(self):
 
         def _sample_completions(self,
                                 prompt,
+                                imgs,
                                 temperature,
                                 seed,
                                 stop_token=None,
                                 num_completions=1):
-            del prompt, temperature, seed, stop_token, num_completions
+            del prompt, temperature, seed, stop_token, num_completions, imgs
             return [self.response]
 
     llm = _MockLLM()
diff --git a/tests/approaches/test_llm_open_loop_approach.py b/tests/approaches/test_llm_open_loop_approach.py
index 323df5d2f1..9ff484e330 100644
--- a/tests/approaches/test_llm_open_loop_approach.py
+++ b/tests/approaches/test_llm_open_loop_approach.py
@@ -10,7 +10,7 @@
 from predicators.datasets import create_dataset
 from predicators.envs import create_new_env
 from predicators.ground_truth_models import get_gt_options
-from predicators.llm_interface import LargeLanguageModel
+from predicators.pretrained_model_interface import LargeLanguageModel
 
 
 def test_llm_open_loop_approach():
@@ -19,7 +19,7 @@ def test_llm_open_loop_approach():
     cache_dir = "_fake_llm_cache_dir"
     utils.reset_config({
         "env": env_name,
-        "llm_prompt_cache_dir": cache_dir,
+        "pretrained_model_prompt_cache_dir": cache_dir,
         "approach": "llm_open_loop",
         "num_train_tasks": 1,
         "num_test_tasks": 1,
@@ -51,11 +51,12 @@ def get_id(self):
 
         def _sample_completions(self,
                                 prompt,
+                                imgs,
                                 temperature,
                                 seed,
                                 stop_token=None,
                                 num_completions=1):
-            del prompt, temperature, seed, stop_token, num_completions
+            del prompt, temperature, seed, stop_token, num_completions, imgs
             return [self.response]
 
     llm = _MockLLM()
diff --git a/tests/approaches/test_nsrt_learning_approach.py b/tests/approaches/test_nsrt_learning_approach.py
index 485479ff16..6050cd2915 100644
--- a/tests/approaches/test_nsrt_learning_approach.py
+++ b/tests/approaches/test_nsrt_learning_approach.py
@@ -359,7 +359,17 @@ def test_grammar_search_invention_approach():
         "grammar_search_search_algorithm": "hill_climbing",
         "pretty_print_when_loading": True,
         "grammar_search_gbfs_num_evals": 1,
+        "save_atoms": True
     }
+    _test_approach(env_name="cover",
+                   approach_name="grammar_search_invention",
+                   excluded_predicates="Holding",
+                   try_solving=False,
+                   sampler_learner="random",
+                   num_train_tasks=3,
+                   additional_settings=additional_settings)
+    # Now test loading.
+    additional_settings.update({"load_atoms": True})
     _test_approach(env_name="cover",
                    approach_name="grammar_search_invention",
                    excluded_predicates="Holding",
@@ -370,6 +380,7 @@ def test_grammar_search_invention_approach():
     # Test approach with unrecognized search algorithm.
     additional_settings["grammar_search_search_algorithm"] = \
         "not a real search algorithm"
+    additional_settings["load_atoms"] = False
     with pytest.raises(Exception) as e:
         _test_approach(env_name="cover",
                        approach_name="grammar_search_invention",
diff --git a/tests/approaches/test_oracle_approach.py b/tests/approaches/test_oracle_approach.py
index d97ea3766e..482b6cb82b 100644
--- a/tests/approaches/test_oracle_approach.py
+++ b/tests/approaches/test_oracle_approach.py
@@ -33,7 +33,8 @@
 from predicators.envs.sandwich import SandwichEnv
 from predicators.envs.satellites import SatellitesEnv, SatellitesSimpleEnv
 from predicators.envs.screws import ScrewsEnv
-from predicators.envs.stick_button import StickButtonEnv
+from predicators.envs.stick_button import StickButtonEnv, \
+    StickButtonMovementEnv
 from predicators.envs.tools import ToolsEnv
 from predicators.envs.touch_point import TouchOpenEnv, TouchPointEnv, \
     TouchPointEnvParam
@@ -46,43 +47,32 @@
 _PDDL_ENV_MODULE_PATH = predicators.envs.pddl_env.__name__
 
 ENV_NAME_AND_CLS = [
-    ("cover", CoverEnv),
-    ("cover_typed_options", CoverEnvTypedOptions),
+    ("cover", CoverEnv), ("cover_typed_options", CoverEnvTypedOptions),
     ("cover_place_hard", CoverEnvPlaceHard),
     ("cover_hierarchical_types", CoverEnvHierarchicalTypes),
-    ("cover_regrasp", CoverEnvRegrasp),
-    ("bumpy_cover", BumpyCoverEnv),
+    ("cover_regrasp", CoverEnvRegrasp), ("bumpy_cover", BumpyCoverEnv),
     ("cover_multistep_options", CoverMultistepOptions),
     ("regional_bumpy_cover", RegionalBumpyCoverEnv),
     ("cluttered_table", ClutteredTableEnv),
-    ("cluttered_table_place", ClutteredTablePlaceEnv),
-    ("blocks", BlocksEnv),
-    ("exit_garage", ExitGarageEnv),
-    ("narrow_passage", NarrowPassageEnv),
-    ("painting", PaintingEnv),
-    ("sandwich", SandwichEnv),
-    ("tools", ToolsEnv),
-    ("playroom", PlayroomEnv),
-    ("repeated_nextto", RepeatedNextToEnv),
+    ("cluttered_table_place", ClutteredTablePlaceEnv), ("blocks", BlocksEnv),
+    ("exit_garage", ExitGarageEnv), ("narrow_passage", NarrowPassageEnv),
+    ("painting", PaintingEnv), ("sandwich", SandwichEnv), ("tools", ToolsEnv),
+    ("playroom", PlayroomEnv), ("repeated_nextto", RepeatedNextToEnv),
     ("repeated_nextto_single_option", RepeatedNextToSingleOptionEnv),
     ("repeated_nextto_ambiguous", RepeatedNextToAmbiguousEnv),
     ("repeated_nextto_simple", RepeatedNextToSimple),
-    ("satellites", SatellitesEnv),
-    ("satellites_simple", SatellitesSimpleEnv),
+    ("satellites", SatellitesEnv), ("satellites_simple", SatellitesSimpleEnv),
     ("screws", ScrewsEnv),
     ("repeated_nextto_painting", RepeatedNextToPaintingEnv),
     ("pddl_blocks_fixed_tasks", FixedTasksBlocksPDDLEnv),
     ("pddl_blocks_procedural_tasks", ProceduralTasksBlocksPDDLEnv),
     ("pddl_delivery_procedural_tasks", ProceduralTasksDeliveryPDDLEnv),
     ("pddl_easy_delivery_procedural_tasks",
-     ProceduralTasksEasyDeliveryPDDLEnv),
-    ("touch_point", TouchPointEnv),
-    ("touch_point_param", TouchPointEnvParam),
-    ("touch_open", TouchOpenEnv),
+     ProceduralTasksEasyDeliveryPDDLEnv), ("touch_point", TouchPointEnv),
+    ("touch_point_param", TouchPointEnvParam), ("touch_open", TouchOpenEnv),
     ("stick_button", StickButtonEnv),
-    ("doors", DoorsEnv),
-    ("coffee", CoffeeEnv),
-    ("pybullet_blocks", PyBulletBlocksEnv),
+    ("stick_button_move", StickButtonMovementEnv), ("doors", DoorsEnv),
+    ("coffee", CoffeeEnv), ("pybullet_blocks", PyBulletBlocksEnv)
 ]
 
 # For each environment name in ENV_NAME_AND_CLS, a list of additional
diff --git a/tests/datasets/mock_vlm_datasets/ice_tea_making__demo+labelled_atoms__manual__1.txt b/tests/datasets/mock_vlm_datasets/ice_tea_making__demo+labelled_atoms__manual__1.txt
new file mode 100644
index 0000000000..ac19194123
--- /dev/null
+++ b/tests/datasets/mock_vlm_datasets/ice_tea_making__demo+labelled_atoms__manual__1.txt
@@ -0,0 +1,44 @@
+===
+{hand_grasping_spoon(hand, spoon): True.
+hand_grasping_teabag(hand, teabag): True.
+spoon_in_cup(spoon, cup): True.
+spoon_on_plate(spoon, plate): True.
+teabag_in_cup(teabag, cup): True.
+teabag_on_plate(teabag, plate): True.} ->
+
+pick(teabag, hand)[] -> 
+
+{hand_grasping_spoon(hand, spoon): True.
+hand_grasping_teabag(hand, teabag): True.
+spoon_in_cup(spoon, cup): True.
+spoon_on_plate(spoon, plate): True.
+teabag_in_cup(teabag, cup): True.
+teabag_on_plate(teabag, plate): True.} ->
+
+place_in(teabag, cup)[] -> 
+
+{hand_grasping_spoon(hand, spoon): True.
+hand_grasping_teabag(hand, teabag): True.
+spoon_in_cup(spoon, cup): True.
+spoon_on_plate(spoon, plate): True.
+teabag_in_cup(teabag, cup): True.
+teabag_on_plate(teabag, plate): True.} ->
+
+pick(spoon, hand)[] -> 
+
+{hand_grasping_spoon(hand, spoon): True.
+hand_grasping_teabag(hand, teabag): True.
+spoon_in_cup(spoon, cup): True.
+spoon_on_plate(spoon, plate): True.
+teabag_in_cup(teabag, cup): True.
+teabag_on_plate(teabag, plate): True.} ->
+
+place_in(spoon, cup)[] -> 
+
+{hand_grasping_spoon(hand, spoon): True.
+hand_grasping_teabag(hand, teabag): True.
+spoon_in_cup(spoon, cup): True.
+spoon_on_plate(spoon, plate): True.
+teabag_in_cup(teabag, cup): True.
+teabag_on_plate(teabag, plate): True.}
+===
diff --git a/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/0/IMG_3779.jpg b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/0/IMG_3779.jpg
new file mode 100644
index 0000000000..07ef3c08d6
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/0/IMG_3779.jpg differ
diff --git a/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/1/IMG_3780.jpg b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/1/IMG_3780.jpg
new file mode 100644
index 0000000000..b703c77836
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/1/IMG_3780.jpg differ
diff --git a/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/2/IMG_3781.jpg b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/2/IMG_3781.jpg
new file mode 100644
index 0000000000..4d01c1332c
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/2/IMG_3781.jpg differ
diff --git a/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/3/IMG_3782.jpg b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/3/IMG_3782.jpg
new file mode 100644
index 0000000000..ecc6c2f0ff
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/3/IMG_3782.jpg differ
diff --git a/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/4/IMG_3783.jpg b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/4/IMG_3783.jpg
new file mode 100644
index 0000000000..4a362c12b1
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/4/IMG_3783.jpg differ
diff --git a/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/options_traj.txt b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/options_traj.txt
new file mode 100644
index 0000000000..97226f37ca
--- /dev/null
+++ b/tests/datasets/mock_vlm_datasets/ice_tea_making__vlm_demos__456__1/traj_0/options_traj.txt
@@ -0,0 +1,4 @@
+pick(teabag, hand, [])
+place_in(teabag, cup, [])
+pick(spoon, hand, [])
+place_in(spoon, cup, [])
diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
index c801544616..98895a59a5 100644
--- a/tests/datasets/test_datasets.py
+++ b/tests/datasets/test_datasets.py
@@ -7,15 +7,57 @@
 
 from predicators import utils
 from predicators.datasets import create_dataset
+from predicators.datasets.generate_atom_trajs_with_vlm import \
+    create_ground_atom_data_from_img_trajs
 from predicators.envs.blocks import BlocksEnv
 from predicators.envs.cluttered_table import ClutteredTableEnv
 from predicators.envs.cover import CoverEnv, CoverMultistepOptions
+from predicators.envs.vlm_envs import IceTeaMakingEnv
 from predicators.ground_truth_models import _get_predicates_by_names, \
     get_gt_options, parse_config_included_options
+from predicators.pretrained_model_interface import VisionLanguageModel
 from predicators.settings import CFG
 from predicators.structs import Dataset, GroundAtom, Task
 
 
+class _DummyVLM(VisionLanguageModel):
+
+    def get_id(self):
+        return "dummy"
+
+    def _sample_completions(self,
+                            prompt,
+                            imgs,
+                            temperature,
+                            seed,
+                            stop_token=None,
+                            num_completions=1):
+        del imgs  # unused.
+        completions = []
+        for _ in range(num_completions):
+            # Canned atom proposals when the query asks for predicates.
+            if "Please provide predicates" in prompt:
+                completion = "*Holding(spoon)\n*Fizz(buzz)\n" + \
+                    "Submerged(teabag)\nSubmerged(spoon)"
+            # Canned truth values when the query asks to label predicates.
+            elif "values of the following predicates" in prompt:
+                # Completion for default predicates.
+                if "Submerged" in prompt:
+                    completion = "*Holding(spoon): True.\n" + \
+                        "*Submerged(teabag): False.\n*Submerged(spoon): False."
+                # Completion for debug predicates.
+                else:
+                    completion = ("hand_grasping_spoon(hand, spoon): True.\n"
+                                  "hand_grasping_teabag(hand, teabag): True.\n"
+                                  "spoon_in_cup(spoon, cup): True.\n"
+                                  "spoon_on_plate(spoon, plate): True.\n"
+                                  "teabag_in_cup(teabag, cup): True.\n"
+                                  "teabag_on_plate(teabag, plate): True.")
+            # NOTE(review): `completion` is unbound if neither branch matched.
+            completions.append(completion)
+        return completions
+
+
 def test_demo_dataset():
     """Test demo-only dataset creation with Covers env."""
     # Test that data does not contain options since
@@ -465,3 +507,105 @@ def test_empty_dataset():
     assert len(dataset.trajectories) == 0
     with pytest.raises(AssertionError):
         _ = dataset.annotations
+
+
+@pytest.mark.parametrize(
+    "atom_proposal_prompt_type, atom_labelling_prompt_type",
+    [("naive_each_step", "per_scene_naive"),
+     ("options_labels_whole_traj", "per_scene_naive"),
+     ("naive_whole_traj", "per_scene_cot"),
+     ("not_a_real_prompt_type", "per_scene_cot"),
+     ("naive_whole_traj", "not_a_real_prompt_type")])
+def test_loading_img_demos(atom_proposal_prompt_type,
+                           atom_labelling_prompt_type):
+    """Test creating ground atom data from img demos per prompt type pair."""
+    utils.reset_config({
+        "env":
+        "ice_tea_making",
+        "num_train_tasks":
+        1,
+        "offline_data_method":
+        "img_demos",
+        "data_dir":
+        "tests/datasets/mock_vlm_datasets",
+        "seed":
+        456,
+        "vlm_trajs_folder_name":
+        "ice_tea_making__vlm_demos__456__1",
+        "grammar_search_vlm_atom_proposal_prompt_type":
+        atom_proposal_prompt_type,
+        "grammar_search_vlm_atom_label_prompt_type":
+        atom_labelling_prompt_type,
+        "pretrained_model_prompt_cache_dir":
+        "tests/datasets/mock_vlm_datasets/cache"
+    })
+    env = IceTeaMakingEnv()
+    train_tasks = env.get_train_tasks()
+    vlm = _DummyVLM()
+    if atom_proposal_prompt_type != "not_a_real_prompt_type" and \
+        atom_labelling_prompt_type != "not_a_real_prompt_type":
+        loaded_dataset = create_ground_atom_data_from_img_trajs(
+            env, train_tasks, get_gt_options(env.get_name()), vlm)
+        assert len(loaded_dataset.trajectories) == 1
+        assert len(loaded_dataset.annotations) == 1
+        assert len(loaded_dataset.annotations[0][0]) == 1
+        assert "Holding(spoon:spoon)" in str(loaded_dataset.annotations[0][0])
+        assert "DummyGoal" in str(loaded_dataset.annotations[0][-1])
+    else:
+        with pytest.raises(ValueError) as e:
+            loaded_dataset = create_ground_atom_data_from_img_trajs(
+                env, train_tasks, get_gt_options(env.get_name()), vlm)
+        assert "Unknown" in str(e)
+    for dirpath, _, filenames in os.walk(
+            CFG.pretrained_model_prompt_cache_dir):
+        # Empty the prompt cache: unlink files but leave directories.
+        for filename in filenames:
+            os.unlink(os.path.join(dirpath, filename))
+
+
+def test_env_debug_grammar():
+    """Test creating ground atom data from img demo files when the config
+    flag grammar_search_vlm_atom_proposal_use_debug is turned on."""
+    utils.reset_config({
+        "env": "ice_tea_making",
+        "num_train_tasks": 1,
+        "offline_data_method": "img_demos",
+        "data_dir": "tests/datasets/mock_vlm_datasets",
+        "seed": 456,
+        "vlm_trajs_folder_name": "ice_tea_making__vlm_demos__456__1",
+        "grammar_search_vlm_atom_proposal_prompt_type":
+        "options_labels_whole_traj",
+        "grammar_search_vlm_atom_label_prompt_type": "per_scene_naive",
+        "grammar_search_vlm_atom_proposal_use_debug": True
+    })
+    env = IceTeaMakingEnv()
+    train_tasks = env.get_train_tasks()
+    vlm = _DummyVLM()
+    loaded_dataset = create_ground_atom_data_from_img_trajs(
+        env, train_tasks, get_gt_options(env.get_name()), vlm)
+    assert len(loaded_dataset.trajectories) == 1
+    assert len(loaded_dataset.annotations) == 1
+    assert len(loaded_dataset.annotations[0][0]) == 6
+    assert "hand_grasping_spoon" in str(loaded_dataset.annotations[0][0])
+    assert "DummyGoal" in str(loaded_dataset.annotations[0][-1])
+
+
+def test_loading_txt_files():
+    """Test loading a demo+labelled_atoms dataset from a handmade txt file."""
+    utils.reset_config({
+        "env":
+        "ice_tea_making",
+        "num_train_tasks":
+        1,
+        "offline_data_method":
+        "demo+labelled_atoms",
+        "data_dir":
+        "tests/datasets/mock_vlm_datasets",
+        "handmade_demo_filename":
+        "ice_tea_making__demo+labelled_atoms__manual__1.txt"
+    })
+    env = IceTeaMakingEnv()
+    train_tasks = env.get_train_tasks()
+    loaded_dataset = create_dataset(env, train_tasks,
+                                    get_gt_options(env.get_name()))
+    assert len(loaded_dataset.trajectories) == 1
diff --git a/tests/envs/test_blocks.py b/tests/envs/test_blocks.py
index af074a21cc..831d22a694 100644
--- a/tests/envs/test_blocks.py
+++ b/tests/envs/test_blocks.py
@@ -8,12 +8,13 @@
 import pytest
 
 import predicators.envs.blocks
+import predicators.pretrained_model_interface
 from predicators import utils
 from predicators.envs.blocks import BlocksEnv, BlocksEnvClear
 from predicators.ground_truth_models import get_gt_options
 
 _ENV_MODULE_PATH = predicators.envs.blocks.__name__
-_LLM_MODULE_PATH = predicators.llm_interface.__name__
+_LLM_MODULE_PATH = predicators.pretrained_model_interface.__name__
 
 
 def test_blocks():
diff --git a/tests/envs/test_sandwich.py b/tests/envs/test_sandwich.py
index 4e3520323d..37814886ac 100644
--- a/tests/envs/test_sandwich.py
+++ b/tests/envs/test_sandwich.py
@@ -7,14 +7,14 @@
 import numpy as np
 import pytest
 
-import predicators.llm_interface
+import predicators.pretrained_model_interface
 from predicators import utils
 from predicators.envs import create_new_env
 from predicators.envs.sandwich import SandwichEnv
 from predicators.ground_truth_models import get_gt_options
 from predicators.structs import Action, GroundAtom
 
-_LLM_MODULE_PATH = predicators.llm_interface.__name__
+_LLM_MODULE_PATH = predicators.pretrained_model_interface.__name__
 
 
 def test_sandwich_properties():
diff --git a/tests/envs/test_stick_button.py b/tests/envs/test_stick_button.py
index 46f015c125..efe01e478f 100644
--- a/tests/envs/test_stick_button.py
+++ b/tests/envs/test_stick_button.py
@@ -5,7 +5,8 @@
 import pytest
 
 from predicators import utils
-from predicators.envs.stick_button import StickButtonEnv
+from predicators.envs.stick_button import StickButtonEnv, \
+    StickButtonMovementEnv
 from predicators.ground_truth_models import get_gt_nsrts, get_gt_options
 from predicators.structs import Action, EnvironmentTask, GroundAtom
 
@@ -175,7 +176,7 @@ def test_stick_button():
     assert StickPressButton.name == "StickPressButton"
 
     # Test RobotPressButton.
-    option = RobotPressButton.ground([robot, reachable_button], [])
+    option = RobotPressButton.ground([robot, reachable_button, stick], [])
     option_plan = [option]
 
     policy = utils.option_plan_to_policy(option_plan)
@@ -306,3 +307,194 @@ def test_stick_button():
     rng = np.random.default_rng(123)
     option = ground_nsrt.sample_option(state, set(), rng)
     assert -1 <= option.params[0] <= 1
+
+
+def test_stick_button_move():
+    """Tests for the movement variant of stick button."""
+    utils.reset_config({
+        "env": "stick_button_move",
+        "stick_button_num_buttons_train": [2],
+        "stick_button_disable_angles": False,
+        "stick_button_holder_scale": 0.001,
+    })
+    env = StickButtonMovementEnv()
+    for task in env.get_train_tasks():
+        for obj in task.init:
+            assert len(obj.type.feature_names) == len(task.init[obj])
+    for task in env.get_test_tasks():
+        for obj in task.init:
+            assert len(obj.type.feature_names) == len(task.init[obj])
+    assert len(env.predicates) == 6
+    assert len(env.goal_predicates) == 1
+    AboveNoButton = [p for p in env.predicates if p.name == "AboveNoButton"][0]
+    assert {pred.name for pred in env.goal_predicates} == {"Pressed"}
+    assert len(get_gt_options(env.get_name())) == 6
+    assert len(env.types) == 4
+    button_type, holder_type, robot_type, stick_type = sorted(env.types)
+    assert button_type.name == "button"
+    assert holder_type.name == "holder"
+    assert robot_type.name == "robot"
+    assert stick_type.name == "stick"
+    assert env.action_space.shape == (5, )
+    # Create a custom initial state, with the robot in the middle, one button
+    # reachable on the left, one button out of the reachable zone in the middle,
+    # and the stick on the right at a 45 degree angle.
+    state = env.get_train_tasks()[0].init.copy()
+    robot, = state.get_objects(robot_type)
+    stick, = state.get_objects(stick_type)
+    holder, = state.get_objects(holder_type)
+    buttons = state.get_objects(button_type)
+    assert len(buttons) == 2
+    robot_x = (env.rz_x_ub + env.rz_x_lb) / 2
+    state.set(robot, "x", robot_x)
+    state.set(robot, "y", (env.rz_y_ub + env.rz_y_lb) / 2)
+    state.set(robot, "theta", np.pi / 2)
+    state.set(robot, "fingers", 1.0)
+    reachable_button, unreachable_button = buttons
+    reachable_x = (env.rz_x_ub + env.rz_x_lb) / 4
+    state.set(reachable_button, "x", reachable_x)
+    state.set(reachable_button, "y", (env.rz_y_ub + env.rz_y_lb) / 2)
+    unreachable_x = robot_x
+    state.set(unreachable_button, "x", unreachable_x)
+    unreachable_y = 0.75 * env.y_ub
+    assert not env.rz_y_lb <= unreachable_y <= env.rz_y_ub
+    state.set(unreachable_button, "y", unreachable_y)
+    stick_x = 3 * (env.rz_x_ub + env.rz_x_lb) / 4
+    state.set(stick, "x", stick_x)
+    state.set(stick, "y", (env.rz_y_ub + env.rz_y_lb) / 4)
+    state.set(stick, "theta", np.pi / 4)
+    state.set(stick, "held", 0.0)
+
+    task = EnvironmentTask(state, task.goal)  # goal of last loop task
+    env.render_state(state, task)
+    assert GroundAtom(AboveNoButton, []).holds(state)
+    ## Test options ##
+    options = get_gt_options(env.get_name())
+    PickStick, PlaceStick, RobotMoveToButton, RobotPressButton, \
+        StickMoveToButton, StickPressButton = sorted(options)
+    assert PickStick.name == "PickStick"
+    assert PlaceStick.name == "PlaceStick"
+    assert RobotPressButton.name == "RobotPressButton"
+    assert StickPressButton.name == "StickPressButton"
+    assert RobotMoveToButton.name == "RobotMoveToButton"
+    assert StickMoveToButton.name == "StickMoveToButton"
+    # Test PickStick.
+    option = PickStick.ground([robot, stick], [0.1])
+    option_plan = [option]
+
+    policy = utils.option_plan_to_policy(option_plan)
+    traj = utils.run_policy_with_simulator(
+        policy,
+        env.simulate,
+        task.init,
+        lambda _: False,
+        max_num_steps=1000,
+        exceptions_to_break_on={utils.OptionExecutionFailure})
+    assert traj.states[-2].get(stick, "held") < 0.5
+    assert traj.states[-2].get(robot, "fingers") > 0.5
+    assert traj.states[-1].get(stick, "held") > 0.5
+    assert traj.states[-1].get(robot, "fingers") <= 0.5
+
+    # Test StickPressButton without moving first to show it doesn't work.
+    option = StickPressButton.ground([robot, stick, unreachable_button], [])
+    bad_option_plan = option_plan[:]
+    bad_option_plan.append(option)
+
+    policy = utils.option_plan_to_policy(bad_option_plan)
+    traj = utils.run_policy_with_simulator(
+        policy,
+        env.simulate,
+        task.init,
+        lambda _: False,
+        max_num_steps=1000,
+        exceptions_to_break_on={utils.OptionExecutionFailure})
+    assert traj.states[-2].get(unreachable_button, "pressed") < 0.5
+    # assert pressing didn't work.
+    assert traj.states[-1].get(unreachable_button, "pressed") < 0.5
+
+    # Test StickPressButton properly with moving first.
+    option = StickMoveToButton.ground([robot, unreachable_button, stick], [])
+    option_plan.append(option)
+    option = StickPressButton.ground([robot, stick, unreachable_button], [])
+    option_plan.append(option)
+
+    policy = utils.option_plan_to_policy(option_plan)
+    traj = utils.run_policy_with_simulator(
+        policy,
+        env.simulate,
+        task.init,
+        lambda _: False,
+        max_num_steps=1000,
+        exceptions_to_break_on={utils.OptionExecutionFailure})
+    assert traj.states[-2].get(unreachable_button, "pressed") < 0.5
+    # assert pressing worked.
+    assert traj.states[-1].get(unreachable_button, "pressed") > 0.5
+
+    # Test PlaceStick
+    utils.reset_config({
+        "env": "stick_button_move",
+        "stick_button_num_buttons_train": [2],
+        "stick_button_disable_angles": True,
+        "stick_button_holder_scale": 0.1,
+    })
+    env = StickButtonMovementEnv()
+    state = env.get_train_tasks()[1].init.copy()
+    task = EnvironmentTask(state, task.goal)
+    robot, = state.get_objects(robot_type)
+    stick, = state.get_objects(stick_type)
+    holder, = state.get_objects(holder_type)
+    buttons = state.get_objects(button_type)
+    option_plan = [
+        PickStick.ground([robot, stick], [0.3]),
+        StickMoveToButton.ground([robot, buttons[0], stick], []),
+        StickPressButton.ground([robot, stick, buttons[0]], []),
+        PlaceStick.ground((robot, stick, holder), [0.4])
+    ]
+    policy = utils.option_plan_to_policy(option_plan)
+    traj = utils.run_policy_with_simulator(
+        policy,
+        env.simulate,
+        task.init,
+        lambda _: False,
+        max_num_steps=1000,
+        exceptions_to_break_on={utils.OptionExecutionFailure})
+    assert traj.states[-2].get(stick, "held") > 0.5
+    assert traj.states[-2].get(robot, "fingers") <= 0.5
+    assert traj.states[-1].get(stick, "held") < 0.5
+    assert traj.states[-1].get(robot, "fingers") > 0.5
+
+    # Special test for PlaceStick NSRT because it's not used by oracle.
+    nsrts = get_gt_nsrts(env.get_name(), env.predicates, options)
+    nsrt = next(iter(n for n in nsrts if n.name == "PlaceStickFromNothing"))
+    ground_nsrt = nsrt.ground([robot, stick, holder])
+    rng = np.random.default_rng(123)
+    option = ground_nsrt.sample_option(state, set(), rng)
+    assert -1 <= option.params[0] <= 1
+
+    # Test that simulate() is a no-op (state unchanged) if the robot tries
+    # to pick when colliding with the stick holder.
+    utils.reset_config({
+        "env": "stick_button",  # NOTE(review): not "stick_button_move"?
+        "stick_button_num_buttons_train": [1],
+        "stick_button_disable_angles": True,
+        "stick_button_holder_scale": 0.1,
+    })
+    env = StickButtonMovementEnv()
+    # Create a custom initial state, with the robot right on top of the stick
+    # and stick holder.
+    state = env.get_train_tasks()[0].init.copy()
+    holder, = state.get_objects(holder_type)
+    robot, = state.get_objects(robot_type)
+    stick, = state.get_objects(stick_type)
+    x = (env.rz_x_ub + env.rz_x_lb) / 2
+    y = (env.rz_y_ub + env.rz_y_lb) / 2
+    state.set(robot, "x", x)
+    state.set(robot, "y", y)
+    state.set(stick, "x", x)
+    state.set(stick, "y", y)
+    state.set(holder, "x", x - (env.holder_height - env.stick_height) / 2)
+    state.set(holder, "y", y)
+    # Press to pick up the stick.
+    action = Action(np.array([0.0, 0.0, 0.0, -1.0, 1.0], dtype=np.float32))
+    next_state = env.simulate(state, action)
+    assert state.allclose(next_state)  # should noop
diff --git a/tests/envs/test_vlm_envs.py b/tests/envs/test_vlm_envs.py
new file mode 100644
index 0000000000..5f3ad4f973
--- /dev/null
+++ b/tests/envs/test_vlm_envs.py
@@ -0,0 +1,33 @@
+"""Tests for VLM predicate invention environments."""
+
+import numpy as np
+import pytest
+
+from predicators import utils
+from predicators.envs.vlm_envs import IceTeaMakingEnv
+from predicators.structs import Action, DefaultState, Object, State
+
+
+def test_ice_tea_making():
+    """Tests IceTeaMakingEnv metadata, tasks, and unsupported methods."""
+    utils.reset_config({"num_train_tasks": 5, "num_test_tasks": 5})
+    env = IceTeaMakingEnv()
+    assert env.get_name() == "ice_tea_making"
+    assert len(env.types) == 7
+    assert len(env.predicates) == 1
+    assert len(env.goal_predicates) == 1
+    assert len(env.get_train_tasks()) == 5
+    assert len(env.get_test_tasks()) == 5
+    assert env.action_space.shape == (0, )
+    with pytest.raises(ValueError):  # simulate() is expected to raise
+        env.simulate(DefaultState, Action(np.zeros(0)))
+    with pytest.raises(ValueError):  # rendering is expected to raise too
+        env.render_state_plt(DefaultState, None, Action(np.zeros(0)))
+    t_list = sorted(list(env.types))
+    goal_type_list = [t for t in t_list if t.name == 'goal_object']
+    goal_type = goal_type_list[0]
+    goal_obj = Object("goal", goal_type)
+    init_state = State({goal_obj: np.array([0.0])})
+    goal_preds_list = list(env.goal_predicates)
+    goal_pred = goal_preds_list[0]
+    assert not goal_pred.holds(init_state, [goal_obj])  # goal unmet at 0.0
diff --git a/tests/test_llm_interface.py b/tests/test_pretrained_model_interface.py
similarity index 52%
rename from tests/test_llm_interface.py
rename to tests/test_pretrained_model_interface.py
index 45d7344d1b..13056ddacd 100644
--- a/tests/test_llm_interface.py
+++ b/tests/test_pretrained_model_interface.py
@@ -4,9 +4,11 @@
 import shutil
 
 import pytest
+from PIL import Image
 
 from predicators import utils
-from predicators.llm_interface import LargeLanguageModel, OpenAILLM
+from predicators.pretrained_model_interface import GoogleGeminiVLM, \
+    LargeLanguageModel, OpenAILLM, VisionLanguageModel
 
 
 class _DummyLLM(LargeLanguageModel):
@@ -16,10 +18,33 @@ def get_id(self):
 
     def _sample_completions(self,
                             prompt,
+                            imgs,
                             temperature,
                             seed,
                             stop_token=None,
                             num_completions=1):
+        del imgs  # unused.
+        completions = []
+        for _ in range(num_completions):
+            completion = (f"Prompt: {prompt}. Seed: {seed}. "
+                          f"Temp: {temperature:.1f}. Stop: {stop_token}.")
+            completions.append(completion)
+        return completions
+
+
+class _DummyVLM(VisionLanguageModel):
+
+    def get_id(self):
+        return "dummy"
+
+    def _sample_completions(self,
+                            prompt,
+                            imgs,
+                            temperature,
+                            seed,
+                            stop_token=None,
+                            num_completions=1):
+        del imgs  # unused.
         completions = []
         for _ in range(num_completions):
             completion = (f"Prompt: {prompt}. Seed: {seed}. "
@@ -31,13 +56,14 @@ def _sample_completions(self,
 def test_large_language_model():
     """Tests for LargeLanguageModel()."""
     cache_dir = "_fake_llm_cache_dir"
-    utils.reset_config({"llm_prompt_cache_dir": cache_dir})
+    utils.reset_config({"pretrained_model_prompt_cache_dir": cache_dir})
     # Remove the fake cache dir in case it's lying around from old tests.
     shutil.rmtree(cache_dir, ignore_errors=True)
     # Query a dummy LLM.
     llm = _DummyLLM()
     assert llm.get_id() == "dummy"
     completions = llm.sample_completions("Hello!",
+                                         None,
                                          0.5,
                                          123,
                                          stop_token="#stop",
@@ -46,13 +72,18 @@ def test_large_language_model():
     assert completions == [expected_completion] * 3
     # Query it again, covering the case where we load from disk.
     completions = llm.sample_completions("Hello!",
+                                         None,
                                          0.5,
                                          123,
                                          stop_token="#stop",
                                          num_completions=3)
     assert completions == [expected_completion] * 3
     # Query with temperature 0.
-    completions = llm.sample_completions("Hello!", 0.0, 123, num_completions=3)
+    completions = llm.sample_completions("Hello!",
+                                         None,
+                                         0.0,
+                                         123,
+                                         num_completions=3)
     expected_completion = "Prompt: Hello!. Seed: 123. Temp: 0.0. Stop: None."
     assert completions == [expected_completion] * 3
     # Clean up the cache dir.
@@ -61,16 +92,38 @@ def test_large_language_model():
     utils.update_config({"llm_use_cache_only": True})
     with pytest.raises(ValueError) as e:
         completions = llm.sample_completions("Hello!",
+                                             None,
                                              0.5,
                                              123,
                                              num_completions=3)
-    assert "No cached response found for LLM prompt." in str(e)
+    assert "No cached response found for prompt." in str(e)
+
+
+def test_vision_language_model():
+    """Tests for VisionLanguageModel()."""
+    cache_dir = "_fake_vlm_cache_dir"
+    utils.reset_config({"pretrained_model_prompt_cache_dir": cache_dir})
+    # Remove the fake cache dir in case it's lying around from old tests.
+    shutil.rmtree(cache_dir, ignore_errors=True)
+    # Query a dummy VLM.
+    vlm = _DummyVLM()
+    assert vlm.get_id() == "dummy"
+    dummy_img = Image.new('RGB', (100, 100))
+    completions = vlm.sample_completions("Hello!", [dummy_img],
+                                         0.5,
+                                         123,
+                                         stop_token="#stop",
+                                         num_completions=1)
+    expected_completion = "Prompt: Hello!. Seed: 123. Temp: 0.5. Stop: #stop."
+    assert completions == [expected_completion] * 1
+    # Clean up the cache dir.
+    shutil.rmtree(cache_dir)
 
 
 def test_openai_llm():
     """Tests for OpenAILLM()."""
     cache_dir = "_fake_llm_cache_dir"
-    utils.reset_config({"llm_prompt_cache_dir": cache_dir})
+    utils.reset_config({"pretrained_model_prompt_cache_dir": cache_dir})
     if "OPENAI_API_KEY" not in os.environ:  # pragma: no cover
         os.environ["OPENAI_API_KEY"] = "dummy API key"
     # Create an OpenAILLM with the curie model.
@@ -83,3 +136,14 @@ def test_openai_llm():
     # completions2 = llm.sample_completions("Hi", 0.5, 123, num_completions=2)
     # assert completions == completions2
     # shutil.rmtree(cache_dir)
+
+
+def test_gemini_vlm():
+    """Tests for GoogleGeminiVLM()."""
+    cache_dir = "_fake_llm_cache_dir"
+    utils.reset_config({"pretrained_model_prompt_cache_dir": cache_dir})
+    if "GOOGLE_API_KEY" not in os.environ:  # pragma: no cover
+        os.environ["GOOGLE_API_KEY"] = "dummy API key"
+    # Create a GoogleGeminiVLM with the gemini-pro-vision model.
+    vlm = GoogleGeminiVLM("gemini-pro-vision")
+    assert vlm.get_id() == "Google-gemini-pro-vision"