From 17249c636dfbe89166f85aceaa78c542a5b6efa7 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Tue, 10 Dec 2024 13:15:46 -0800 Subject: [PATCH 01/21] set use_semantic_sensor=False everywhere --- .../frameworks/config_utils/make_dataset_configs.py | 12 ++++++------ .../monty/frameworks/environment_utils/transforms.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py index 215e5ff5..20059956 100644 --- a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py +++ b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py @@ -246,7 +246,7 @@ def __post_init__(self): world_coord=True, zooms=agent_args["zooms"], get_all_points=True, - use_semantic_sensor=True, + use_semantic_sensor=False, ), ] @@ -276,7 +276,7 @@ def __post_init__(self): world_coord=True, zooms=agent_args["zooms"], get_all_points=True, - use_semantic_sensor=True, + use_semantic_sensor=False, ), ] @@ -296,7 +296,7 @@ def __post_init__(self): world_coord=True, zooms=1, get_all_points=True, - use_semantic_sensor=True, + use_semantic_sensor=False, depth_clip_sensors=(0,), clip_value=1.1, ), @@ -380,7 +380,7 @@ def __post_init__(self): world_coord=True, zooms=agent_args["zooms"], get_all_points=True, - use_semantic_sensor=True, + use_semantic_sensor=False, depth_clip_sensors=(0,), # comma needed to make it a tuple clip_value=0.05, ), @@ -421,7 +421,7 @@ def __post_init__(self): world_coord=True, zooms=agent_args["zooms"], get_all_points=True, - use_semantic_sensor=True, + use_semantic_sensor=False, depth_clip_sensors=(0,), # comma needed to make it a tuple clip_value=0.05, ), @@ -918,7 +918,7 @@ def __post_init__(self): world_coord=True, zooms=agent_args["zooms"], get_all_points=True, - use_semantic_sensor=True, + use_semantic_sensor=False, ), ] diff --git a/src/tbp/monty/frameworks/environment_utils/transforms.py b/src/tbp/monty/frameworks/environment_utils/transforms.py index 49fbfc23..ba442328 100644 --- a/src/tbp/monty/frameworks/environment_utils/transforms.py +++ b/src/tbp/monty/frameworks/environment_utils/transforms.py @@ -258,7 +258,7 @@ def __init__( depth_clip_sensors=(), world_coord=True, get_all_points=False, - use_semantic_sensor=True, + use_semantic_sensor=False, ): self.needs_rng = False From a4392b329acea600e869e740aab9d4e3f1b73264 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Mon, 16 Dec 2024 16:43:49 -0800 Subject: [PATCH 02/21] Update make_dataset_configs.py --- src/tbp/monty/frameworks/config_utils/make_dataset_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py index 64f9a96e..5cc13eb4 100644 --- a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py +++ b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py @@ -1328,7 +1328,7 @@ def make_multi_sensor_mount_config( # sensor rotations if rotations is None: - rotations = np.zeros([arr_len, 4]) + rotations = np.ones([arr_len, 4]) rotations[:, 0] = 1.0 else: rotations = np.asarray(rotations) From a00619475a482c624f191281df6b5b172bda1fe3 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Mon, 16 Dec 2024 16:46:27 -0800 Subject: [PATCH 03/21] Undo accidental rotation change --- src/tbp/monty/frameworks/config_utils/make_dataset_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py index 5cc13eb4..64f9a96e 100644 --- a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py +++ b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py @@ -1328,7 +1328,7 @@ def make_multi_sensor_mount_config( # sensor rotations if rotations is None: - rotations = np.ones([arr_len, 4]) + rotations = np.zeros([arr_len, 4]) rotations[:, 0] = 1.0 else: rotations = np.asarray(rotations) From e6c50fd4a7383a305c26fbeb543ff2a5d543f411 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Mon, 16 Dec 2024 17:09:07 -0800 Subject: [PATCH 04/21] Change how semantic maps are obtained Also fix spelling error. `get_perc_on_obj_semantic` had the parameter "sematic" [sic]. I changed it to "semantic" and updated uses. --- .../monty/frameworks/models/motor_policies.py | 44 +++++++------------ tests/unit/policy_test.py | 4 +- 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/src/tbp/monty/frameworks/models/motor_policies.py b/src/tbp/monty/frameworks/models/motor_policies.py index b26fcdb7..eee3c3ae 100644 --- a/src/tbp/monty/frameworks/models/motor_policies.py +++ b/src/tbp/monty/frameworks/models/motor_policies.py @@ -574,31 +574,21 @@ def move_close_enough( Raises: ValueError: If the object is not visible """ - view = raw_observation[self.agent_id][view_sensor_id]["semantic"] - points_on_target_obj = ( - raw_observation[self.agent_id][view_sensor_id]["semantic"] - == target_semantic_id - ) + # Reconstruct 2D semantic map. + depth_image = raw_observation[self.agent_id][view_sensor_id]["depth"] + semantic_3d = raw_observation[self.agent_id][view_sensor_id]["semantic_3d"] + semantic_image = semantic_3d[:, 3].reshape(depth_image.shape).astype(int) + points_on_target_obj = semantic_image == target_semantic_id + n_points_on_target_obj = points_on_target_obj.sum() # For multi-object experiments, handle the possibility that object is no # longer visible - if multi_objects_present and ( - len( - raw_observation[self.agent_id][view_sensor_id]["depth"][ - points_on_target_obj - ] - ) - == 0 - ): + if multi_objects_present and n_points_on_target_obj == 0: logging.debug("Object not visible, cannot move closer") return None, True - if len(points_on_target_obj) > 0: - closest_point_on_target_obj = np.min( - raw_observation[self.agent_id][view_sensor_id]["depth"][ - points_on_target_obj - ] - ) + if n_points_on_target_obj > 0: + closest_point_on_target_obj = np.min(depth_image[points_on_target_obj]) logging.debug( "closest target object point: " + str(closest_point_on_target_obj) ) @@ -608,18 +598,14 @@ def move_close_enough( ) perc_on_target_obj = get_perc_on_obj_semantic( - view, sematic_id=target_semantic_id + semantic_image, semantic_id=target_semantic_id ) logging.debug("% on target object: " + str(perc_on_target_obj)) # Also calculate closest point on *any* object so that we don't get too close # and clip into objects; NB that any object will have a semantic ID > 0 - points_on_any_obj = ( - raw_observation[self.agent_id][view_sensor_id]["semantic"] > 0 - ) - closest_point_on_any_obj = np.min( - raw_observation[self.agent_id][view_sensor_id]["depth"][points_on_any_obj] - ) + points_on_any_obj = semantic_image > 0 + closest_point_on_any_obj = np.min(depth_image[points_on_any_obj]) logging.debug("closest point on any object: " + str(closest_point_on_any_obj)) if perc_on_target_obj < self.good_view_percentage: @@ -1313,7 +1299,7 @@ def get_perc_on_obj(rgba_obs): 
return per_on_obj -def get_perc_on_obj_semantic(semantic_obs, sematic_id=0): +def get_perc_on_obj_semantic(semantic_obs, semantic_id=0): """Get the percentage of pixels in the observation that land on the target object. If a semantic ID is provided, then only pixels on the target object are counted; @@ -1330,11 +1316,11 @@ def get_perc_on_obj_semantic(semantic_obs, sematic_id=0): perc_on_obj: Percentage of pixels on the object. """ res = semantic_obs.shape[0] * semantic_obs.shape[1] - if sematic_id == 0: + if semantic_id == 0: csum = np.sum(semantic_obs >= 1) else: # Count only pixels on the target (e.g. primary target) object - csum = np.sum(semantic_obs == sematic_id) + csum = np.sum(semantic_obs == semantic_id) per_on_obj = csum / res return per_on_obj diff --git a/tests/unit/policy_test.py b/tests/unit/policy_test.py index 14a25888..b2e69b10 100644 --- a/tests/unit/policy_test.py +++ b/tests/unit/policy_test.py @@ -598,7 +598,7 @@ def test_get_good_view_basic_dist_agent(self): observation = next(self.exp.dataloader) # TODO M remove the following train-wreck during refactor view = observation[self.exp.model.motor_system.agent_id]["view_finder"] - perc_on_target_obj = get_perc_on_obj_semantic(view["semantic"], sematic_id=1) + perc_on_target_obj = get_perc_on_obj_semantic(view["semantic"], semantic_id=1) dict_config = config_to_dict(config) @@ -707,7 +707,7 @@ def test_get_good_view_multi_object(self): observation = next(self.exp.dataloader) # TODO M remove the following train-wreck during refactor view = observation[self.exp.model.motor_system.agent_id]["view_finder"] - perc_on_target_obj = get_perc_on_obj_semantic(view["semantic"], sematic_id=1) + perc_on_target_obj = get_perc_on_obj_semantic(view["semantic"], semantic_id=1) dict_config = config_to_dict(config) target_perc_on_target_obj = dict_config["monty_config"]["motor_system_config"][ From dad74bdf0f94b1f2c15aa1219a76fd72007b31bf Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Mon, 16 Dec 2024 17:10:04 -0800 Subject: [PATCH 05/21] Update make_dataset_configs.py Change default "semantics" to False in Multi-LM mount configs. 
--- src/tbp/monty/frameworks/config_utils/make_dataset_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py index 64f9a96e..25f98b4a 100644 --- a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py +++ b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py @@ -1340,7 +1340,7 @@ def make_multi_sensor_mount_config( # sensor semantics if semantics is None: - semantics = np.ones(arr_len, dtype=bool) + semantics = np.zeros(arr_len, dtype=bool) else: semantics = np.asarray(semantics, dtype=bool) assert semantics.shape == (arr_len,), f"`semantics` must have shape ({arr_len},)" From bfcb38dc0cad1b09167f69041370bb7cd03fd784 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Mon, 16 Dec 2024 20:53:59 -0800 Subject: [PATCH 06/21] Use estimated semantic map for single-object experiments --- .../config_utils/make_dataset_configs.py | 2 +- .../frameworks/environments/embodied_data.py | 2 ++ .../monty/frameworks/models/motor_policies.py | 29 +++++++++++++++---- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py index 25f98b4a..64f9a96e 100644 --- a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py +++ b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py @@ -1340,7 +1340,7 @@ def make_multi_sensor_mount_config( # sensor semantics if semantics is None: - semantics = np.zeros(arr_len, dtype=bool) + semantics = np.ones(arr_len, dtype=bool) else: semantics = np.asarray(semantics, dtype=bool) assert semantics.shape == (arr_len,), f"`semantics` must have shape ({arr_len},)" diff --git a/src/tbp/monty/frameworks/environments/embodied_data.py b/src/tbp/monty/frameworks/environments/embodied_data.py index 3722b674..1eb0d469 100644 --- a/src/tbp/monty/frameworks/environments/embodied_data.py +++ b/src/tbp/monty/frameworks/environments/embodied_data.py @@ -542,6 +542,7 @@ def get_good_view(self, view_sensor_id): self._observation, view_sensor_id, target_semantic_id=self.primary_target["semantic_id"], + multi_objects_present=self.num_distactors > 0, ) if not on_object: for action in actions: @@ -572,6 +573,7 @@ def get_good_view(self, view_sensor_id): self._observation, view_sensor_id, target_semantic_id=self.primary_target["semantic_id"], + multi_objects_present=self.num_distactors > 0, ) if not on_object: for action in actions: diff --git a/src/tbp/monty/frameworks/models/motor_policies.py b/src/tbp/monty/frameworks/models/motor_policies.py index eee3c3ae..45a0d37c 100644 --- a/src/tbp/monty/frameworks/models/motor_policies.py +++ b/src/tbp/monty/frameworks/models/motor_policies.py @@ -13,7 +13,7 @@ import logging import math import os -from typing import Any, Callable, Dict, List, Tuple, Type, Union, cast +from typing import Any, Callable, Dict, List, Mapping, Tuple, Type, Union, cast import numpy as np import quaternion as qt @@ -551,7 +551,11 @@ def convert_motor_state(self): ### def move_close_enough( - self, raw_observation, view_sensor_id, target_semantic_id, multi_objects_present + self, + raw_observation: Mapping, + view_sensor_id: str, + target_semantic_id: int, + multi_objects_present: bool, ) -> Tuple[Union[Action, None], bool]: """At beginning of episode move close enough to the object. 
@@ -579,10 +583,14 @@ def move_close_enough( semantic_3d = raw_observation[self.agent_id][view_sensor_id]["semantic_3d"] semantic_image = semantic_3d[:, 3].reshape(depth_image.shape).astype(int) + if not multi_objects_present: + semantic_image[semantic_image > 0] = target_semantic_id + points_on_target_obj = semantic_image == target_semantic_id n_points_on_target_obj = points_on_target_obj.sum() + # For multi-object experiments, handle the possibility that object is no - # longer visible + # longer visible. if multi_objects_present and n_points_on_target_obj == 0: logging.debug("Object not visible, cannot move closer") return None, True @@ -628,7 +636,11 @@ def move_close_enough( return None, True # done def orient_to_object( - self, raw_observation, view_sensor_id, target_semantic_id + self, + raw_observation: Mapping, + view_sensor_id: str, + target_semantic_id: int, + multi_objects_present: bool, ) -> Tuple[List[Action], bool]: """Rotate sensors so that they are centered on the object using a view finder. @@ -644,9 +656,14 @@ def orient_to_object( Returns: Two actions to execute to put the patch on the object """ - sem_obs = raw_observation[self.agent_id][view_sensor_id]["semantic"] + # Reconstruct 2D semantic map. + depth_image = raw_observation[self.agent_id][view_sensor_id]["depth"] + obs_dim = depth_image.shape[0:2] sem3d_obs = raw_observation[self.agent_id][view_sensor_id]["semantic_3d"] - obs_dim = sem_obs.shape + sem_obs = sem3d_obs[:, 3].reshape(obs_dim).astype(int) + + if not multi_objects_present: + sem_obs[sem_obs > 0] = target_semantic_id logging.debug("Searching for object") From a5452f570f3009ad302592923eb6ba6ad68a3b3c Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Tue, 17 Dec 2024 16:47:20 -0800 Subject: [PATCH 07/21] Update motor_policies.py Update docstring --- src/tbp/monty/frameworks/models/motor_policies.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tbp/monty/frameworks/models/motor_policies.py b/src/tbp/monty/frameworks/models/motor_policies.py index 45a0d37c..acc289f6 100644 --- a/src/tbp/monty/frameworks/models/motor_policies.py +++ b/src/tbp/monty/frameworks/models/motor_policies.py @@ -652,6 +652,8 @@ def orient_to_object( view_sensor_id: view finder id (str) target_semantic_id: the integer corresponding to the semantic ID of the target object that we will try to fixate on + multi_objects_present: whether there are multiple objects present in the + scene. 
Returns: Two actions to execute to put the patch on the object From e9b129613088b8a8fac46f86ae2346487919a921 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Fri, 20 Dec 2024 19:16:09 -0500 Subject: [PATCH 08/21] Add multi-obj dataset args, default semantics to False --- benchmarks/configs/ycb_experiments.py | 2 + .../config_utils/make_dataset_configs.py | 61 +++++++++++++++---- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/benchmarks/configs/ycb_experiments.py b/benchmarks/configs/ycb_experiments.py index f37f7293..4988b1d9 100644 --- a/benchmarks/configs/ycb_experiments.py +++ b/benchmarks/configs/ycb_experiments.py @@ -32,6 +32,7 @@ FiveLMMountHabitatDatasetArgs, NoisySurfaceViewFinderMountHabitatDatasetArgs, PatchViewFinderMountHabitatDatasetArgs, + PatchViewFinderMultiObjectMountHabitatDatasetArgs, PredefinedObjectInitializer, RandomRotationObjectInitializer, SurfaceViewFinderMountHabitatDatasetArgs, @@ -537,6 +538,7 @@ learning_module_configs=lower_max_nneighbors_1lm_config, monty_args=MontyArgs(min_eval_steps=min_eval_steps), ), + dataset_args=PatchViewFinderMultiObjectMountHabitatDatasetArgs(), eval_dataloader_args=EnvironmentDataloaderMultiObjectArgs( object_names=dict( targets_list=get_object_names_by_idx(0, 10, object_list=DISTINCT_OBJECTS), diff --git a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py index 64f9a96e..bf0e8c35 100644 --- a/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py +++ b/src/tbp/monty/frameworks/config_utils/make_dataset_configs.py @@ -861,7 +861,7 @@ class PatchAndViewFinderMountConfig: default_factory=lambda: [[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] ) semantics: List[List[Union[int, float]]] = field( - default_factory=lambda: [True, True] + default_factory=lambda: [False, False] ) zooms: List[float] = field(default_factory=lambda: [10.0, 1.0]) @@ -1009,7 +1009,7 @@ class MultiLMMountConfig: ] ) semantics: List[List[Union[int, float]]] = field( - default_factory=lambda: [True, True, True] + default_factory=lambda: [False, False, False] ) zooms: List[float] = field(default_factory=lambda: [10.0, 10.0, 1.0]) @@ -1042,7 +1042,7 @@ class TwoLMStackedDistantMountConfig: ] ) semantics: List[List[Union[int, float]]] = field( - default_factory=lambda: [True, True, True] + default_factory=lambda: [False, False, False] ) zooms: List[float] = field(default_factory=lambda: [10.0, 5.0, 1.0]) @@ -1099,13 +1099,56 @@ class FiveLMMountConfig: ] ) semantics: List[List[Union[int, float]]] = field( - default_factory=lambda: [True, True, True, True, True, True] + default_factory=lambda: [False, False, False, False, False, False] ) zooms: List[float] = field( default_factory=lambda: [10.0, 10.0, 10.0, 10.0, 10.0, 1.0] ) +@dataclass +class PatchAndViewFinderMultiObjectMountConfig(PatchAndViewFinderMountConfig): + semantics: List[List[Union[int, float]]] = field( + default_factory=lambda: [True, True] + ) + + +@dataclass +class EnvInitArgsPatchViewFinderMultiObjectMount(EnvInitArgs): + agents: List[AgentConfig] = field( + default_factory=lambda: [ + AgentConfig( + MultiSensorAgent, PatchAndViewFinderMultiObjectMountConfig().__dict__ + ) + ] + ) + + +@dataclass +class PatchViewFinderMultiObjectMountHabitatDatasetArgs: + env_init_func: Callable = field(default=HabitatEnvironment) + env_init_args: Dict = field( + default_factory=lambda: EnvInitArgsPatchViewFinderMultiObjectMount().__dict__ + ) + transform: Union[Callable, list, None] = None + rng: Union[Callable, 
None] = None
+
+    def __post_init__(self):
+        agent_args = self.env_init_args["agents"][0].agent_args
+        self.transform = [
+            MissingToMaxDepth(agent_id=agent_args["agent_id"], max_depth=1),
+            DepthTo3DLocations(
+                agent_id=agent_args["agent_id"],
+                sensor_ids=agent_args["sensor_ids"],
+                resolutions=agent_args["resolutions"],
+                world_coord=True,
+                zooms=agent_args["zooms"],
+                get_all_points=True,
+                use_semantic_sensor=True,
+            ),
+        ]
+
+
 """
 Utilities for generating multi-LM dataset args.
 """
@@ -1269,10 +1312,7 @@ def make_multi_sensor_mount_config(
         positions: Positions of the sensors. If not provided, calls
             `make_sensor_positions_on_grid` with its default arguments.
         rotations: Rotations of the sensors. Defaults to [1, 0, 0, 0] for all sensors.
-        semantics: Defaults to `False` for all sensors except for the last entry
-            which is set to `True`. This is because Monty currently requires the
-            view finder to create semantic maps. If given, `semantics` must also
-            have `semantics[-1]` set to `True`.
+        semantics: Defaults to `False` for all sensors.
         zooms: Zooms of the sensors. Defaults to 10.0 for all sensors except
             for the view finder (which has a zoom of 1.0)
 
@@ -1340,13 +1380,10 @@ def make_multi_sensor_mount_config(
 
     # sensor semantics
     if semantics is None:
-        semantics = np.ones(arr_len, dtype=bool)
+        semantics = np.zeros(arr_len, dtype=bool)
     else:
         semantics = np.asarray(semantics, dtype=bool)
     assert semantics.shape == (arr_len,), f"`semantics` must have shape ({arr_len},)"
-    # TODO: Support `False` values. They currently cause errors.
-    # Also make sure numpy.bool is OK here. May possibly need to use built-in booleans
-    # (so possibly make dtype 'object', but I think it's fine).
     mount_config["semantics"] = semantics
 
     # sensor zooms

From 7c92374759b3b9501682025170ad79ae10bcee54 Mon Sep 17 00:00:00 2001
From: Scott Knudstrup
Date: Sat, 21 Dec 2024 18:02:11 -0500
Subject: [PATCH 09/21] Update policy_test.py

---
 tests/unit/policy_test.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tests/unit/policy_test.py b/tests/unit/policy_test.py
index b2e69b10..c58c2d4f 100644
--- a/tests/unit/policy_test.py
+++ b/tests/unit/policy_test.py
@@ -53,6 +53,7 @@
     ExperimentArgs,
     FiveLMMountHabitatDatasetArgs,
     PatchViewFinderMountHabitatDatasetArgs,
+    PatchViewFinderMultiObjectMountHabitatDatasetArgs,
     PredefinedObjectInitializer,
     SurfaceViewFinderMountHabitatDatasetArgs,
 )
@@ -313,6 +314,7 @@ def setUp(self):
         self.poor_initial_view_multi_object_config.update(
             # For multi-objects, we test get good view at evaluation, because in
             # Monty we don't currently train with multiple objects in the environment
+            dataset_args=PatchViewFinderMultiObjectMountHabitatDatasetArgs(),
             eval_dataloader_args=EnvironmentDataloaderMultiObjectArgs(
                 object_names=dict(
                     targets_list=["cubeSolid"],
@@ -598,7 +600,8 @@ def test_get_good_view_basic_dist_agent(self):
         observation = next(self.exp.dataloader)
         # TODO M remove the following train-wreck during refactor
         view = observation[self.exp.model.motor_system.agent_id]["view_finder"]
-        perc_on_target_obj = get_perc_on_obj_semantic(view["semantic"], semantic_id=1)
+        semantic = view["semantic_3d"][:, 3].reshape(view["depth"].shape)
+        perc_on_target_obj = get_perc_on_obj_semantic(semantic, semantic_id=1)
 
         dict_config = config_to_dict(config)
 
@@ -611,7 +614,7 @@ def test_get_good_view_basic_dist_agent(self):
         ), f"Initial view is not good enough, {perc_on_target_obj}\
             vs target of {target_perc_on_target_obj}"
 
-        points_on_target_obj = 
view["semantic"] == 1 + points_on_target_obj = semantic == 1 closest_point_on_target_obj = np.min(view["depth"][points_on_target_obj]) target_closest_point = dict_config["monty_config"]["motor_system_config"][ @@ -658,7 +661,10 @@ def test_touch_object_basic_surf_agent(self): ] dict_config = config_to_dict(config) - points_on_target_obj = view["semantic"] == 1 + # points_on_target_obj = view["semantic"] == 1 + points_on_target_obj = ( + view["semantic_3d"][:, 3].reshape(view["depth"].shape) == 1 + ) closest_point_on_target_obj = np.min(view["depth"][points_on_target_obj]) assert ( @@ -707,7 +713,8 @@ def test_get_good_view_multi_object(self): observation = next(self.exp.dataloader) # TODO M remove the following train-wreck during refactor view = observation[self.exp.model.motor_system.agent_id]["view_finder"] - perc_on_target_obj = get_perc_on_obj_semantic(view["semantic"], semantic_id=1) + semantic = view["semantic_3d"][:, 3].reshape(view["depth"].shape) + perc_on_target_obj = get_perc_on_obj_semantic(semantic, semantic_id=1) dict_config = config_to_dict(config) target_perc_on_target_obj = dict_config["monty_config"]["motor_system_config"][ @@ -719,7 +726,7 @@ def test_get_good_view_multi_object(self): ), f"Initial view is not good enough, {perc_on_target_obj}\ vs target of {target_perc_on_target_obj}" - points_on_target_obj = view["semantic"] == 1 + points_on_target_obj = semantic == 1 closest_point_on_target_obj = np.min(view["depth"][points_on_target_obj]) target_closest_point = dict_config["monty_config"]["motor_system_config"][ From a33d3d82268d576a67122aac590034982e3facdb Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Mon, 23 Dec 2024 16:10:54 -0500 Subject: [PATCH 10/21] Fix for surf agent w/o semantic sensor --- benchmarks/sandbox.py | 46 ++++++++++++++++ .../environment_utils/transforms.py | 52 +++++++++++++++---- 2 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 benchmarks/sandbox.py diff --git a/benchmarks/sandbox.py b/benchmarks/sandbox.py new file mode 100644 index 00000000..5623e69e --- /dev/null +++ b/benchmarks/sandbox.py @@ -0,0 +1,46 @@ +from tbp.monty.frameworks.run_env import setup_env + +setup_env() +from configs import CONFIGS # noqa: E402 + +from tbp.monty.frameworks.run import main # noqa: E402 + +main(all_configs=CONFIGS, experiments=["base_config_10distinctobj_surf_agent"]) + +# import matplotlib.pyplot as plt +# import numpy as np + +# from tbp.monty.frameworks.environment_utils.transforms import DepthTo3DLocations + +# t = DepthTo3DLocations( +# agent_id="agent_id_0", +# sensor_ids=["patch", "view_finder"], +# resolutions=[[64, 64], [64, 64]], +# zooms=[1.0, 10.0], +# get_all_points=True, +# ) + +# depth = np.ones((64, 64)) +# depth[32, 32] = 0 + +# observations = { +# "agent_id_0": { +# "patch": { +# "depth": np.random.rand(64, 64), +# "rgba": np.random.rand(64, 64, 4), +# }, +# "view_finder": { +# "depth": depth, +# "rgba": np.random.rand(64, 64, 4), +# }, +# }, +# } + +# obs = t(observations) +# depth = obs["agent_id_0"]["view_finder"]["depth"] +# sem = obs["agent_id_0"]["view_finder"]["semantic_3d"][:, 3].reshape(depth.shape) + + +# plt.imshow(sem) +# plt.colorbar() +# plt.show() diff --git a/src/tbp/monty/frameworks/environment_utils/transforms.py b/src/tbp/monty/frameworks/environment_utils/transforms.py index ba442328..e7944459 100644 --- a/src/tbp/monty/frameworks/environment_utils/transforms.py +++ b/src/tbp/monty/frameworks/environment_utils/transforms.py @@ -7,6 +7,9 @@ # license that can be found in the LICENSE file or at # 
https://opensource.org/licenses/MIT. +from numbers import Number +from typing import Tuple + import numpy as np import quaternion as qt import scipy @@ -309,6 +312,9 @@ def __call__(self, observations, state=None): agent_obs = observations[self.agent_id][sensor_id] if i in self.depth_clip_sensors: self.clip(agent_obs) + default_on_surface_th = self.clip_value + else: + default_on_surface_th = 1000 depth_obs = agent_obs["depth"] # if applying depth clip, then do not use depth for semantic info # because the depth surface now includes a sheet of pixels all @@ -317,11 +323,15 @@ def __call__(self, observations, state=None): if self.depth_clip_sensors and self.use_semantic_sensor: semantic_obs = agent_obs["semantic"] elif self.use_semantic_sensor: - surface_obs = self.get_semantic_from_depth(depth_obs.copy()) + surface_obs = self.get_semantic_from_depth( + depth_obs, default_on_surface_th + ) # set pixel to 1 if it is on the main surface and on the object semantic_obs = agent_obs["semantic"] * surface_obs else: - semantic_obs = self.get_semantic_from_depth(depth_obs.copy()) + semantic_obs = self.get_semantic_from_depth( + depth_obs, default_on_surface_th + ) # Approximate true world coordinates x, y = np.meshgrid( @@ -395,11 +405,16 @@ def clip(self, agent_obs): Set the values of 0 (infinite depth) to the clip value. """ if "semantic" in agent_obs.keys(): - agent_obs["semantic"][agent_obs["depth"] > self.clip_value] = 0 + agent_obs["semantic"][agent_obs["depth"] >= self.clip_value] = 0 agent_obs["depth"][agent_obs["depth"] > self.clip_value] = self.clip_value agent_obs["depth"][agent_obs["depth"] == 0] = self.clip_value - def get_on_surface_th(self, depth_patch, min_depth_range): + def get_on_surface_th( + self, + depth_patch, + min_depth_range: Number, + default_on_surface_th: Number, + ) -> Tuple[Number, bool]: """Return a depth threshold if we have a bimodal depth distribution. If the depth values are in a large enough range (> min_depth_range) we may @@ -424,13 +439,14 @@ def get_on_surface_th(self, depth_patch, min_depth_range): Args: depth_patch: sensor patch observations of depth min_depth_range: minimum range of depth values to even be considered - + default_on_surface_th: default threshold to use if no bimodal distribution + is found Returns: threshold and whether we want to use values above or below threshold """ - depths = np.array(depth_patch).flatten() + depths = np.asarray(depth_patch).flatten() flip_sign = False - th = 1000 # just high value + th = default_on_surface_th if (max(depths) - min(depths)) > min_depth_range: # only check for bimodal distribution if we have a large enough # range in depth values @@ -452,19 +468,35 @@ def get_on_surface_th(self, depth_patch, min_depth_range): flip_sign = True return th, flip_sign - def get_semantic_from_depth(self, depth_patch): + def get_semantic_from_depth( + self, depth_patch: np.ndarray, default_on_surface_th: Number + ) -> np.ndarray: """Return semantic patch information from heuristics on depth patch. 
Args:
             depth_patch: sensor patch observations of depth
-
+            default_on_surface_th: default threshold to use if no bimodal distribution
+                is found
         Returns:
             sensor patch shaped info about whether each pixel is on surface or not
         """
         # avoid large range when seeing the table (goes up to almost 100 and then
         # just using 8 bins will not work anymore)
+        depth_patch = np.array(depth_patch)
         depth_patch[depth_patch > 1] = 1.0
-        th, flip_sign = self.get_on_surface_th(depth_patch, min_depth_range=0.01)
+
+        # If all depth values are at maximum (1.0), then we are automatically
+        # off-object.
+        if np.all(depth_patch == 1.0):
+            return np.zeros_like(depth_patch, dtype=bool)
+
+        # Compute the on-surface depth threshold (and whether we need to flip the
+        # sign), and apply it to the depth to get the semantic patch.
+        th, flip_sign = self.get_on_surface_th(
+            depth_patch,
+            min_depth_range=0.01,
+            default_on_surface_th=default_on_surface_th,
+        )
         if flip_sign is False:
             semantic_patch = depth_patch < th
         else:

From 9b08b2b1666229d5ef9459082b9a9fb486e0e634 Mon Sep 17 00:00:00 2001
From: Scott Knudstrup
Date: Mon, 23 Dec 2024 17:25:37 -0500
Subject: [PATCH 11/21] Update sandbox.py

---
 benchmarks/sandbox.py | 48 ++++++++++++-------------------------------
 1 file changed, 13 insertions(+), 35 deletions(-)

diff --git a/benchmarks/sandbox.py b/benchmarks/sandbox.py
index 5623e69e..95e7adad 100644
--- a/benchmarks/sandbox.py
+++ b/benchmarks/sandbox.py
@@ -7,40 +7,18 @@
 
 main(all_configs=CONFIGS, experiments=["base_config_10distinctobj_surf_agent"])
 
-# import matplotlib.pyplot as plt
-# import numpy as np
-
-# from tbp.monty.frameworks.environment_utils.transforms import DepthTo3DLocations
-
-# t = DepthTo3DLocations(
-#     agent_id="agent_id_0",
-#     sensor_ids=["patch", "view_finder"],
-#     resolutions=[[64, 64], [64, 64]],
-#     zooms=[1.0, 10.0],
-#     get_all_points=True,
-# )
-
-# depth = np.ones((64, 64))
-# depth[32, 32] = 0
-
-# observations = {
-#     "agent_id_0": {
-#         "patch": {
-#             "depth": np.random.rand(64, 64),
-#             "rgba": np.random.rand(64, 64, 4),
-#         },
-#         "view_finder": {
-#             "depth": depth,
-#             "rgba": np.random.rand(64, 64, 4),
-#         },
-#     },
-# }
-
-# obs = t(observations)
-# depth = obs["agent_id_0"]["view_finder"]["depth"]
-# sem = obs["agent_id_0"]["view_finder"]["semantic_3d"][:, 3].reshape(depth.shape)
-
-
-# plt.imshow(sem)
-# plt.colorbar()
-# plt.show()
+"""
+Unsupervised
+----------------------------------------------------------------------------------------
+"""
+
+# from tbp.monty.frameworks.utils.logging_utils import (
+#     load_stats,
+#     print_unsupervised_stats,
+# )
+
+# exp_path = "/Users/sknudstrup/tbp/results/monty/projects/monty_runs/surf_agent_unsupervised_10distinctobj"
+# stats, _, _, _ = load_stats(
+#     exp_path, load_eval=False, load_detailed=False, load_models=False
+# )
+# print_unsupervised_stats(stats, 10)

From 1fd3b371d86834873e5c16d98a3c0da4593a668c Mon Sep 17 00:00:00 2001
From: Scott Knudstrup
Date: Tue, 24 Dec 2024 17:07:24 -0500
Subject: [PATCH 12/21] rename multi_objects_present -> multiple_objects_present,
 fix find_location_to_look_at

---
 .../frameworks/environments/embodied_data.py  | 13 ++++---
 .../monty/frameworks/models/motor_policies.py | 34 +++++++++++++------
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/src/tbp/monty/frameworks/environments/embodied_data.py b/src/tbp/monty/frameworks/environments/embodied_data.py
index 3d660f97..5c536883 100644
--- a/src/tbp/monty/frameworks/environments/embodied_data.py
+++ b/src/tbp/monty/frameworks/environments/embodied_data.py
@@ -103,7 
+103,6 @@ def __getitem__(self, action: Action): state = self.env.get_state() if self.transform is not None: observation = self.apply_transform(self.transform, observation, state) - return observation, state def __len__(self): @@ -546,13 +545,13 @@ def get_good_view( # TODO break up this method so that there is less code duplication # Start by ensuring the center of the patch is covering the primary target # object before we start moving forward; only done for multi-object experiments - - if self.num_distactors > 0: + multiple_objects_present = self.num_distactors > 0 + if multiple_objects_present: actions, on_object = self.motor_system.orient_to_object( self._observation, view_sensor_id, target_semantic_id=self.primary_target["semantic_id"], - multi_objects_present=self.num_distactors > 0, + multiple_objects_present=multiple_objects_present, ) if not on_object: for action in actions: @@ -564,7 +563,7 @@ def get_good_view( self._observation, view_sensor_id, target_semantic_id=self.primary_target["semantic_id"], - multi_objects_present=self.num_distactors > 0, + multiple_objects_present=multiple_objects_present, ) # Continue moving to a close distance to the object while not close_enough: @@ -574,7 +573,7 @@ def get_good_view( self._observation, view_sensor_id, target_semantic_id=self.primary_target["semantic_id"], - multi_objects_present=self.num_distactors > 0, + multiple_objects_present=multiple_objects_present, ) # Re-center ourselves (if necessary) after having moved closer @@ -582,7 +581,7 @@ def get_good_view( self._observation, view_sensor_id, target_semantic_id=self.primary_target["semantic_id"], - multi_objects_present=self.num_distactors > 0, + multiple_objects_present=multiple_objects_present, ) if not on_object: for action in actions: diff --git a/src/tbp/monty/frameworks/models/motor_policies.py b/src/tbp/monty/frameworks/models/motor_policies.py index b6b9ef3c..9446bf35 100644 --- a/src/tbp/monty/frameworks/models/motor_policies.py +++ b/src/tbp/monty/frameworks/models/motor_policies.py @@ -555,7 +555,7 @@ def move_close_enough( raw_observation: Mapping, view_sensor_id: str, target_semantic_id: int, - multi_objects_present: bool, + multiple_objects_present: bool, ) -> Tuple[Union[Action, None], bool]: """At beginning of episode move close enough to the object. @@ -567,7 +567,7 @@ def move_close_enough( view_sensor_id: The ID of the view sensor target_semantic_id: The semantic ID of the primary target object in the scene. - multi_objects_present: Whether there are multiple objects present in the + multiple_objects_present: Whether there are multiple objects present in the scene. If so, we do additional checks to make sure we don't get too close to these when moving forward @@ -583,7 +583,7 @@ def move_close_enough( semantic_3d = raw_observation[self.agent_id][view_sensor_id]["semantic_3d"] semantic_image = semantic_3d[:, 3].reshape(depth_image.shape).astype(int) - if not multi_objects_present: + if not multiple_objects_present: semantic_image[semantic_image > 0] = target_semantic_id points_on_target_obj = semantic_image == target_semantic_id @@ -591,7 +591,7 @@ def move_close_enough( # For multi-object experiments, handle the possibility that object is no # longer visible. 
- if multi_objects_present and n_points_on_target_obj == 0: + if multiple_objects_present and n_points_on_target_obj == 0: logging.debug("Object not visible, cannot move closer") return None, True @@ -618,7 +618,7 @@ def move_close_enough( if perc_on_target_obj < self.good_view_percentage: if closest_point_on_target_obj > self.desired_object_distance: - if multi_objects_present and ( + if multiple_objects_present and ( closest_point_on_any_obj < self.desired_object_distance / 4 ): logging.debug( @@ -640,7 +640,7 @@ def orient_to_object( raw_observation: Mapping, view_sensor_id: str, target_semantic_id: int, - multi_objects_present: bool, + multiple_objects_present: bool, ) -> Tuple[List[Action], bool]: """Rotate sensors so that they are centered on the object using a view finder. @@ -652,7 +652,7 @@ def orient_to_object( view_sensor_id: view finder id (str) target_semantic_id: the integer corresponding to the semantic ID of the target object that we will try to fixate on - multi_objects_present: whether there are multiple objects present in the + multiple_objects_present: whether there are multiple objects present in the scene. Returns: @@ -664,7 +664,7 @@ def orient_to_object( sem3d_obs = raw_observation[self.agent_id][view_sensor_id]["semantic_3d"] sem_obs = sem3d_obs[:, 3].reshape(obs_dim).astype(int) - if not multi_objects_present: + if not multiple_objects_present: sem_obs[sem_obs > 0] = target_semantic_id logging.debug("Searching for object") @@ -676,7 +676,10 @@ def orient_to_object( return [], True relative_location = self.find_location_to_look_at( - sem3d_obs, image_shape=obs_dim, target_semantic_id=target_semantic_id + sem3d_obs, + image_shape=obs_dim, + target_semantic_id=target_semantic_id, + multiple_objects_present=multiple_objects_present, ) down_amount, left_amount = self.compute_look_amounts(relative_location) @@ -702,7 +705,13 @@ def compute_look_amounts(self, relative_location): left_amount = np.degrees(np.arctan2(relative_location[0], relative_location[2])) return down_amount, left_amount - def find_location_to_look_at(self, sem3d_obs, image_shape, target_semantic_id): + def find_location_to_look_at( + self, + sem3d_obs: np.ndarray, + image_shape: Tuple[int, int], + target_semantic_id: int, + multiple_objects_present: bool, + ) -> np.ndarray: """Takes in a semantic 3D observation and returns an x,y,z location. The location is on the object and surrounded by pixels that are also on @@ -715,6 +724,8 @@ def find_location_to_look_at(self, sem3d_obs, image_shape, target_semantic_id): image_shape: the shape of the camera image target_semantic_id: the semantic ID of the target object we'd like to saccade on to + multi_objects_present: whether there are multiple objects present in the + scene. 
Returns: relative_location: the x,y,z distance from camera to pixel with max @@ -723,6 +734,9 @@ def find_location_to_look_at(self, sem3d_obs, image_shape, target_semantic_id): sem3d_obs_image = sem3d_obs.reshape((image_shape[0], image_shape[1], 4)) on_object_image = sem3d_obs_image[:, :, 3] + if not multiple_objects_present: + on_object_image[on_object_image > 0] = target_semantic_id + on_object_image = on_object_image == target_semantic_id on_object_image = on_object_image.astype(float) From 0035622941d457395d2bf7ef3cedd0e7997a65ac Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Tue, 24 Dec 2024 17:36:01 -0500 Subject: [PATCH 13/21] Update tests - Explicitly use semantic sensors for habitat_transform_test.py - Drop more non-matching columns between parallel and non-parallel experiments for run_parallel_test.py since not having a semantic sensor modifies reporting of stepwise_performance and stepwise_target_object. --- benchmarks/sandbox.py | 24 ------------------------ tests/unit/habitat_transform_test.py | 2 ++ tests/unit/run_parallel_test.py | 15 +++++++++------ 3 files changed, 11 insertions(+), 30 deletions(-) delete mode 100644 benchmarks/sandbox.py diff --git a/benchmarks/sandbox.py b/benchmarks/sandbox.py deleted file mode 100644 index 95e7adad..00000000 --- a/benchmarks/sandbox.py +++ /dev/null @@ -1,24 +0,0 @@ -from tbp.monty.frameworks.run_env import setup_env - -setup_env() -from configs import CONFIGS # noqa: E402 - -from tbp.monty.frameworks.run import main # noqa: E402 - -main(all_configs=CONFIGS, experiments=["base_config_10distinctobj_surf_agent"]) - -""" -Unsupervised ----------------------------------------------------------------------------------------- -""" - -# from tbp.monty.frameworks.utils.logging_utils import ( -# load_stats, -# print_unsupervised_stats, -# ) - -# exp_path = "/Users/sknudstrup/tbp/results/monty/projects/monty_runs/surf_agent_unsupervised_10distinctobj" -# stats, _, _, _ = load_stats( -# exp_path, load_eval=False, load_detailed=False, load_models=False -# ) -# print_unsupervised_stats(stats, 10) diff --git a/tests/unit/habitat_transform_test.py b/tests/unit/habitat_transform_test.py index e135a682..a7f228c2 100644 --- a/tests/unit/habitat_transform_test.py +++ b/tests/unit/habitat_transform_test.py @@ -109,6 +109,7 @@ def test_semantic_3d_local(self): agent_id=AGENT_ID, sensor_ids=[SENSOR_ID], resolutions=[resolution], + use_semantic_sensor=True, ) obs = transform(md_obs) module_obs = obs[AGENT_ID][SENSOR_ID] @@ -171,6 +172,7 @@ def setup_test_data( resolutions=[resolution], world_coord=True, get_all_points=False, + use_semantic_sensor=True, ) obs = transform(md_obs, state=mock_state) diff --git a/tests/unit/run_parallel_test.py b/tests/unit/run_parallel_test.py index 511fd33e..c2dab0de 100644 --- a/tests/unit/run_parallel_test.py +++ b/tests/unit/run_parallel_test.py @@ -272,8 +272,9 @@ def test_parallel_runs_n_epochs_lt(self): scsv = pd.read_csv(os.path.join(eval_dir, "eval_stats.csv")) pcsv = pd.read_csv(os.path.join(parallel_eval_dir, "eval_stats.csv")) - scsv.drop(columns="time", inplace=True) - pcsv.drop(columns="time", inplace=True) + for col in ["time", "stepwise_performance", "stepwise_target_object"]: + scsv.drop(columns=col, inplace=True) + pcsv.drop(columns=col, inplace=True) self.assertTrue(pcsv.equals(scsv)) @@ -314,8 +315,9 @@ def test_parallel_runs_n_epochs_lt(self): scsv_lt = pd.read_csv(os.path.join(eval_dir_lt, "eval_stats.csv")) pcsv_lt = pd.read_csv(os.path.join(parallel_eval_dir_lt, "eval_stats.csv")) - 
scsv_lt.drop(columns="time", inplace=True) - pcsv_lt.drop(columns="time", inplace=True) + for col in ["time", "stepwise_performance", "stepwise_target_object"]: + scsv_lt.drop(columns=col, inplace=True) + pcsv_lt.drop(columns=col, inplace=True) self.assertTrue(pcsv_lt.equals(scsv_lt)) @@ -356,8 +358,9 @@ def test_parallel_runs_n_epochs_lt(self): scsv_gt = pd.read_csv(os.path.join(eval_dir_gt, "eval_stats.csv")) pcsv_gt = pd.read_csv(os.path.join(parallel_eval_dir_gt, "eval_stats.csv")) - scsv_gt.drop(columns="time", inplace=True) - pcsv_gt.drop(columns="time", inplace=True) + for col in ["time", "stepwise_performance", "stepwise_target_object"]: + scsv_gt.drop(columns=col, inplace=True) + pcsv_gt.drop(columns=col, inplace=True) self.assertTrue(pcsv_gt.equals(scsv_gt)) From 4186fae25eee8b79597a72e017fe4e684383c901 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Wed, 25 Dec 2024 00:19:04 -0500 Subject: [PATCH 14/21] Update motor_policies.py docstrings --- src/tbp/monty/frameworks/models/motor_policies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tbp/monty/frameworks/models/motor_policies.py b/src/tbp/monty/frameworks/models/motor_policies.py index 9446bf35..97e7bc46 100644 --- a/src/tbp/monty/frameworks/models/motor_policies.py +++ b/src/tbp/monty/frameworks/models/motor_policies.py @@ -724,7 +724,7 @@ def find_location_to_look_at( image_shape: the shape of the camera image target_semantic_id: the semantic ID of the target object we'd like to saccade on to - multi_objects_present: whether there are multiple objects present in the + multiple_objects_present: whether there are multiple objects present in the scene. Returns: @@ -1344,7 +1344,7 @@ def get_perc_on_obj_semantic(semantic_obs, semantic_id=0): Args: semantic_obs: Semantic image observation. - sematic_id: Semantic ID of the target object. + semantic_id: Semantic ID of the target object. Returns: perc_on_obj: Percentage of pixels on the object. From ede3007d72305006ab802d882849ef443537de54 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Mon, 30 Dec 2024 17:09:33 -0500 Subject: [PATCH 15/21] Update policy_test.py Don't use ycb data path for multi-object dataset args. 
---
 tests/unit/policy_test.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/unit/policy_test.py b/tests/unit/policy_test.py
index c58c2d4f..cb447fd1 100644
--- a/tests/unit/policy_test.py
+++ b/tests/unit/policy_test.py
@@ -45,6 +45,7 @@
 )
 from tbp.monty.frameworks.config_utils.make_dataset_configs import (
     EnvInitArgsFiveLMMount,
+    EnvInitArgsPatchViewFinderMultiObjectMount,
     EnvInitArgsPatchViewMount,
     EnvInitArgsSurfaceViewMount,
     EnvironmentDataloaderMultiObjectArgs,
@@ -314,7 +315,11 @@ def setUp(self):
         self.poor_initial_view_multi_object_config.update(
             # For multi-objects, we test get good view at evaluation, because in
             # Monty we don't currently train with multiple objects in the environment
-            dataset_args=PatchViewFinderMultiObjectMountHabitatDatasetArgs(),
+            dataset_args=PatchViewFinderMultiObjectMountHabitatDatasetArgs(
+                env_init_args=EnvInitArgsPatchViewFinderMultiObjectMount(
+                    data_path=None
+                ).__dict__,
+            ),
             eval_dataloader_args=EnvironmentDataloaderMultiObjectArgs(
                 object_names=dict(
                     targets_list=["cubeSolid"],

From 4c343bef357b29e0e687d7ed62cf3349eb2616b0 Mon Sep 17 00:00:00 2001
From: Scott Knudstrup
Date: Mon, 30 Dec 2024 20:41:05 -0500
Subject: [PATCH 16/21] Update embodied_data.py

Add return values to get_good_view and get_good_view_with_patch_refinement so
we can raise an assertion error if we start an episode off-object.
---
 .../frameworks/environments/embodied_data.py | 40 ++++++++++---------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/tbp/monty/frameworks/environments/embodied_data.py b/src/tbp/monty/frameworks/environments/embodied_data.py
index 5c536883..10425766 100644
--- a/src/tbp/monty/frameworks/environments/embodied_data.py
+++ b/src/tbp/monty/frameworks/environments/embodied_data.py
@@ -482,7 +482,10 @@ def __next__(self):
     def pre_episode(self):
         super().pre_episode()
         if not self.dataset.env._agents[0].action_space_type == "surface_agent":
-            self.get_good_view_with_patch_refinement()
+            on_object = self.get_good_view_with_patch_refinement()
+            assert (
+                on_object
+            ), "Primary target must be visible at the start of the episode"
 
     def first_step(self):
         """Carry out particular motor-system state updates required on the first step.
@@ -510,7 +513,7 @@ def first_step(self):
 
     def get_good_view(
         self, view_sensor_id: str, allow_translation: bool = True
-    ) -> None:
+    ) -> bool:
         """Policy to get a good view of the object before an episode starts.
 
         Used by the distant agent to find the initial view of an object at the
@@ -539,6 +542,9 @@ def get_good_view(
                 the motor system's `move_close_enough` method. If `False`, only
                 orienting movements are performed. Default is `True`.
 
+        Returns:
+            Whether the sensor is on the object.
+
         TODO M : move most of this to the motor systems, shouldn't be in
         embodied_data class
         """
@@ -587,20 +593,17 @@ def get_good_view(
             for action in actions:
                 self._observation, self.motor_system.state = self.dataset[action]
 
-        # # Final check that we're on the object
-        # TODO add this back later : at the moment we sometimes just don't have the
-        # object visible here, e.g. 
the spoon, regardless of whether it's a multi - # object experiment or not; orient_to_object seems to sometimes fail due to - # the Gaussean filtering getting us to move to somewhere that's not on the - # actual object - # _, on_object = self.motor_system.orient_to_object( - # self._observation, - # view_sensor_id, - # target_semantic_id=self.primary_target["semantic_id"], - # ) - # assert on_object, "Primary target must be visible at the start of the episode" + # Final check that we're on the object. May be used by calling function + # to raise an error. + _, on_object = self.motor_system.orient_to_object( + self._observation, + view_sensor_id, + target_semantic_id=self.primary_target["semantic_id"], + multiple_objects_present=multiple_objects_present, + ) + return on_object - def get_good_view_with_patch_refinement(self) -> None: + def get_good_view_with_patch_refinement(self) -> bool: """Policy to get a good view of the object for the central patch. Used by the distant agent to move and orient toward an object such that the @@ -612,13 +615,16 @@ def get_good_view_with_patch_refinement(self) -> None: Also currently used by the distant agent after a "jump" has been initialized by a model-based policy. + Returns: + Whether the sensor is on the object. """ self.get_good_view("view_finder") for patch_id in ("patch", "patch_0"): if patch_id in self._observation["agent_id_0"].keys(): - self.get_good_view(patch_id, allow_translation=False) + on_object = self.get_good_view(patch_id, allow_translation=False) break + return on_object def execute_jump_attempt(self): """Attempt a hypothesis-testing "jump" onto a location of the object. @@ -745,8 +751,6 @@ def handle_successful_jump(self): else: self.get_good_view_with_patch_refinement() - # TODO implement better way to get better view after the jump that isn't - # "cheating" by using get_good_view (which uses the semantic sensor) def handle_failed_jump(self, pre_jump_state, first_sensor): """Deal with the results of a failed hypothesis-testing jump. From 9184f60878e0fcb5bfde9f718e8b9979a3612b9b Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Mon, 30 Dec 2024 20:41:17 -0500 Subject: [PATCH 17/21] Update pretraining_experiments.py Prepare to pretrain new models. 
--- benchmarks/configs/pretraining_experiments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/configs/pretraining_experiments.py b/benchmarks/configs/pretraining_experiments.py index a6e47de5..03365db1 100644 --- a/benchmarks/configs/pretraining_experiments.py +++ b/benchmarks/configs/pretraining_experiments.py @@ -59,7 +59,7 @@ monty_models_dir = os.getenv("MONTY_MODELS") fe_pretrain_dir = os.path.expanduser( - os.path.join(monty_models_dir, "pretrained_ycb_v9") + os.path.join(monty_models_dir, "pretrained_ycb_v10") ) pre_surf_agent_visual_training_model_path = os.path.join( From 1eaa595e88df8d392b1067f56410b042365d52f1 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Tue, 7 Jan 2025 19:39:20 -0500 Subject: [PATCH 18/21] Add num_distractors logic for pre-episode assertion Also typos --- benchmarks/configs/ycb_experiments.py | 2 +- benchmarks/sandbox.py | 150 ++++++++++++++++++ .../environment_utils/transforms.py | 145 ++++++++++++++--- .../frameworks/environments/embodied_data.py | 36 +++-- .../monty/frameworks/models/motor_policies.py | 7 +- 5 files changed, 300 insertions(+), 40 deletions(-) create mode 100644 benchmarks/sandbox.py diff --git a/benchmarks/configs/ycb_experiments.py b/benchmarks/configs/ycb_experiments.py index 4988b1d9..f44d4124 100644 --- a/benchmarks/configs/ycb_experiments.py +++ b/benchmarks/configs/ycb_experiments.py @@ -108,7 +108,7 @@ # v8 : Using separate graph per input channel # v9 : Using models trained on 14 unique rotations fe_pretrain_dir = os.path.expanduser( - os.path.join(monty_models_dir, "pretrained_ycb_v9") + os.path.join(monty_models_dir, "pretrained_ycb_v10") ) model_path_10distinctobj = os.path.join( diff --git a/benchmarks/sandbox.py b/benchmarks/sandbox.py new file mode 100644 index 00000000..328aeb43 --- /dev/null +++ b/benchmarks/sandbox.py @@ -0,0 +1,150 @@ +import os + +from tbp.monty.frameworks.run import main # noqa: E402 +from tbp.monty.frameworks.run_env import setup_env + +# os.environ["saveplots"] = "True" + +setup_env() +from configs import CONFIGS # noqa: E402 + +main(all_configs=CONFIGS, experiments=["randrot_noise_sim_on_scan_monty_world"]) + +# from numbers import Number +# from typing import Tuple + +# import matplotlib.pyplot as plt +# import numpy as np + +# from tbp.monty.frameworks.environment_utils.transforms import DepthTo3DLocations + + +# def get_on_surface_th( +# depth_patch, +# min_depth_range: Number, +# default_on_surface_th: Number, +# ) -> Tuple[Number, bool]: +# """Return a depth threshold if we have a bimodal depth distribution. + +# If the depth values are in a large enough range (> min_depth_range) we may +# be looking at more than one surface within our patch. This could either be +# two disjoint surfaces of the object or the object and the background. + +# To figure out if we have two disjoint sets of depth values we look at the +# histogram and check for empty bins in the middle. The center of the empty +# part if the histogram will be defined as the threshold. + +# Next, we want to check if we should use the depth values above or below the +# threshold. Currently this is done by looking which side of the distribution +# is larger (occupies more space in the patch). Alternatively we could check +# which side the depth at the center of the patch is on. I'm not sure what would +# be better. + +# Lastly, if we do decide to use the depth points that are further away, we need +# to make sure they are not the points that are off the object. 
Currently this is +# just done with a simple heuristic (depth difference < 0.1) but in the future we +# will probably have to find a better solution for this. + +# Args: +# depth_patch: sensor patch observations of depth +# min_depth_range: minimum range of depth values to even be considered +# default_on_surface_th: default threshold to use if no bimodal distribution +# is found +# Returns: +# threshold and whether we want to use values above or below threshold +# """ +# depths = np.asarray(depth_patch).flatten() +# flip_sign = False +# th = default_on_surface_th +# if (max(depths) - min(depths)) > min_depth_range: +# # only check for bimodal distribution if we have a large enough +# # range in depth values +# height, bins = np.histogram( +# np.array(depth_patch).flatten(), bins=8, density=False +# ) +# gap = np.where(height == 0)[0] +# if len(gap) > 0: +# # There is a bimodal distribution +# gap_center = len(gap) // 2 +# th_id = gap[gap_center] +# th = bins[th_id] +# # Check which side of the distribution we should use +# if np.sum(height[:th_id]) < np.sum(height[th_id:]): +# # more points in the patch are on the further away surface +# if (bins[-1] - bins[0]) < 0.1: +# # not too large distance between depth values -> avoid +# # flipping sign when off object +# flip_sign = True +# return th, flip_sign + + +# def get_semantic_from_depth( +# depth_patch: np.ndarray, +# default_on_surface_th: Number, +# ) -> np.ndarray: +# """Return semantic patch information from heuristics on depth patch. + +# Args: +# depth_patch: sensor patch observations of depth +# default_on_surface_th: default threshold to use if no bimodal distribution +# is found +# Returns: +# sensor patch shaped info about whether each pixel is on surface of not +# """ +# # avoid large range when seeing the table (goes up to almost 100 and then +# # just using 8 bins will not work anymore) +# depth_patch = np.array(depth_patch) +# depth_patch[depth_patch > 1] = 1.0 + +# # If all depth values are at maximum (1.0), then we are automatically +# # off-object. +# if np.all(depth_patch == 1.0): +# return np.zeros_like(depth_patch, dtype=bool) + +# # Compute the on-suface depth threshold (and whether we need to flip the +# # sign), and apply it to the depth to get the semantic patch. 
+# th, flip_sign = get_on_surface_th( +# depth_patch, +# min_depth_range=0.01, +# default_on_surface_th=default_on_surface_th, +# ) +# print("flip_sign", flip_sign) +# if flip_sign is False: +# semantic_patch = depth_patch < th +# else: +# semantic_patch = depth_patch > th +# return semantic_patch + + +# depth_path = "/Users/sknudstrup/depth.npy" +# depth = np.load(depth_path) +# # t = DepthTo3DLocations() +# clip_value = 0.05 +# default_on_surface_th = clip_value + +# # semantic = get_semantic_from_depth(depth, default_on_surface_th) +# # plt.imshow(semantic) +# # plt.colorbar() +# # plt.show() +# depth_patch = depth.copy() +# min_depth_range = 0.01 +# depths = np.asarray(depth_patch).flatten() +# flip_sign = False +# th = default_on_surface_th +# if (max(depths) - min(depths)) > min_depth_range: +# # only check for bimodal distribution if we have a large enough +# # range in depth values +# height, bins = np.histogram(np.array(depth_patch).flatten(), bins=8, density=False) +# gap = np.where(height == 0)[0] +# if len(gap) > 0: +# # There is a bimodal distribution +# gap_center = len(gap) // 2 +# th_id = gap[gap_center] +# th = bins[th_id] +# # Check which side of the distribution we should use +# if np.sum(height[:th_id]) < np.sum(height[th_id:]): +# # more points in the patch are on the further away surface +# if (bins[-1] - bins[0]) < 0.1: +# # not too large distance between depth values -> avoid +# # flipping sign when off object +# flip_sign = True diff --git a/src/tbp/monty/frameworks/environment_utils/transforms.py b/src/tbp/monty/frameworks/environment_utils/transforms.py index e7944459..fd0bfc46 100644 --- a/src/tbp/monty/frameworks/environment_utils/transforms.py +++ b/src/tbp/monty/frameworks/environment_utils/transforms.py @@ -7,9 +7,12 @@ # license that can be found in the LICENSE file or at # https://opensource.org/licenses/MIT. +import os from numbers import Number +from pathlib import Path from typing import Tuple +import matplotlib.pyplot as plt import numpy as np import quaternion as qt import scipy @@ -310,28 +313,42 @@ def __init__( def __call__(self, observations, state=None): for i, sensor_id in enumerate(self.sensor_ids): agent_obs = observations[self.agent_id][sensor_id] + depth_obs = agent_obs["depth"] + + # Initialize rudimentary semantic mask that makes off-object pixels 0. + if "semantic" in agent_obs.keys(): + semantic_added = False + else: + semantic_added = True + semantic_mask = np.ones_like(depth_obs, dtype=int) + semantic_mask[depth_obs >= 1] = 0 + agent_obs["semantic"] = semantic_mask + semantic_added = True + if i in self.depth_clip_sensors: + # Clip depth and semantic mask. Modifies agent_obs["depth"] and + # semantic_mask in place. Also modifies agent_obs["semantic"] in place + # if it exists. 
self.clip(agent_obs) default_on_surface_th = self.clip_value else: - default_on_surface_th = 1000 - depth_obs = agent_obs["depth"] + default_on_surface_th = 10.0 + # if applying depth clip, then do not use depth for semantic info # because the depth surface now includes a sheet of pixels all # set to the clip_value, and this sheet can confuse the # get_semantic_from_depth function into thinking that it is the object if self.depth_clip_sensors and self.use_semantic_sensor: semantic_obs = agent_obs["semantic"] - elif self.use_semantic_sensor: + else: surface_obs = self.get_semantic_from_depth( - depth_obs, default_on_surface_th + depth_obs, + default_on_surface_th, + agent_obs["semantic"], ) # set pixel to 1 if it is on the main surface and on the object - semantic_obs = agent_obs["semantic"] * surface_obs - else: - semantic_obs = self.get_semantic_from_depth( - depth_obs, default_on_surface_th - ) + # semantic_obs = agent_obs["semantic"] * surface_obs + semantic_obs = surface_obs # Approximate true world coordinates x, y = np.meshgrid( @@ -397,6 +414,14 @@ def __call__(self, observations, state=None): # Add transformed observation to existing dict. We don't need to create # a deepcopy because we are appending a new observation observations[self.agent_id][sensor_id]["semantic_3d"] = semantic_3d + + # Check if plots should be saved + if os.environ.get("saveplots"): + do_plots(observations) + + if semantic_added: + del agent_obs["semantic"] + return observations def clip(self, agent_obs): @@ -404,8 +429,7 @@ def clip(self, agent_obs): Set the values of 0 (infinite depth) to the clip value. """ - if "semantic" in agent_obs.keys(): - agent_obs["semantic"][agent_obs["depth"] >= self.clip_value] = 0 + agent_obs["semantic"][agent_obs["depth"] >= self.clip_value] = 0 agent_obs["depth"][agent_obs["depth"] > self.clip_value] = self.clip_value agent_obs["depth"][agent_obs["depth"] == 0] = self.clip_value @@ -414,6 +438,7 @@ def get_on_surface_th( depth_patch, min_depth_range: Number, default_on_surface_th: Number, + semantic_mask: np.ndarray, ) -> Tuple[Number, bool]: """Return a depth threshold if we have a bimodal depth distribution. @@ -441,9 +466,14 @@ def get_on_surface_th( min_depth_range: minimum range of depth values to even be considered default_on_surface_th: default threshold to use if no bimodal distribution is found + semantic_mask: binary mask indicating on-object locations Returns: threshold and whether we want to use values above or below threshold """ + center_loc = (depth_patch.shape[0] // 2, depth_patch.shape[1] // 2) + depth_center = depth_patch[center_loc[0], center_loc[1]] + semantic_center = semantic_mask[center_loc[0], center_loc[1]] + depths = np.asarray(depth_patch).flatten() flip_sign = False th = default_on_surface_th @@ -459,17 +489,18 @@ def get_on_surface_th( gap_center = len(gap) // 2 th_id = gap[gap_center] th = bins[th_id] - # Check which side of the distribution we should use - if np.sum(height[:th_id]) < np.sum(height[th_id:]): - # more points in the patch are on the further away surface - if (bins[-1] - bins[0]) < 0.1: - # not too large distance between depth values -> avoid - # flipping sign when off object - flip_sign = True + if depth_center > th and semantic_center > 0: + # if the FOV's center is on the further away surface and the FOV's + # center is on-object, then we want to use the further-away surface. 
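+                # E.g., depths clustered near 0.3 and 0.5 leave an empty middle
+                # histogram bin, so th lands in that gap. If the center pixel
+                # sits at 0.5 but is still on-object, flip_sign tells the caller
+                # to keep the far surface by selecting depth_patch > th.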
+ flip_sign = True + return th, flip_sign def get_semantic_from_depth( - self, depth_patch: np.ndarray, default_on_surface_th: Number + self, + depth_patch: np.ndarray, + default_on_surface_th: Number, + semantic_mask: np.ndarray, ) -> np.ndarray: """Return semantic patch information from heuristics on depth patch. @@ -477,6 +508,7 @@ def get_semantic_from_depth( depth_patch: sensor patch observations of depth default_on_surface_th: default threshold to use if no bimodal distribution is found + semantic_mask: binary mask indicating on-object locations Returns: sensor patch shaped info about whether each pixel is on surface of not """ @@ -487,7 +519,7 @@ def get_semantic_from_depth( # If all depth values are at maximum (1.0), then we are automatically # off-object. - if np.all(depth_patch == 1.0): + if np.all(depth_patch >= 1.0): return np.zeros_like(depth_patch, dtype=bool) # Compute the on-suface depth threshold (and whether we need to flip the @@ -496,9 +528,82 @@ def get_semantic_from_depth( depth_patch, min_depth_range=0.01, default_on_surface_th=default_on_surface_th, + semantic_mask=semantic_mask, ) if flip_sign is False: semantic_patch = depth_patch < th else: semantic_patch = depth_patch > th + + semantic_patch = semantic_patch * semantic_mask return semantic_patch + +def do_plots(observations): + fig, axes = plt.subplots(2, 3, figsize=[7.02, 4]) + + sensor_id = "view_finder" + agent_obs = observations["agent_id_0"][sensor_id] + rgba_patch = agent_obs["rgba"] + depth_patch = agent_obs["depth"] + semantic_patch_1d = agent_obs["semantic_3d"][:, 3] + semantic_patch = semantic_patch_1d.reshape( + depth_patch.shape[0], depth_patch.shape[1] + ) + + ax = axes[0, 0] + ax.imshow(rgba_patch) + ax.set_title("RGBA") + + ax = axes[0, 1] + vmin = vmax = None + # if sensor_id == "patch": + # vmin, vmax = 0.0, 0.5 + im = ax.imshow(depth_patch, cmap="gray", vmin=vmin, vmax=vmax) + plt.colorbar(im, ax=ax) + ax.set_title(f"Depth") + + ax = axes[0, 2] + # im = ax.imshow(semantic_patch, cmap="gray", vmin=0, vmax=1) + im = ax.imshow(semantic_patch) + plt.colorbar(im, ax=ax) + center = semantic_patch[32, 32] + on_center = center > 0.0 + ax.set_title(f"Semantic: on_center={on_center}") + + sensor_id = "patch" + agent_obs = observations["agent_id_0"][sensor_id] + rgba_patch = agent_obs["rgba"] + depth_patch = agent_obs["depth"] + semantic_patch_1d = agent_obs["semantic_3d"][:, 3] + semantic_patch = semantic_patch_1d.reshape( + depth_patch.shape[0], depth_patch.shape[1] + ) + + ax = axes[1, 0] + ax.imshow(rgba_patch) + ax.set_title("RGBA") + + ax = axes[1, 1] + vmin = vmax = None + # if sensor_id == "patch": + # vmin, vmax = 0.0, 0.05 + im = ax.imshow(depth_patch, cmap="gray", vmin=vmin, vmax=vmax) + plt.colorbar(im, ax=ax) + ax.set_title(f"Depth") + + ax = axes[1, 2] + # im = ax.imshow(semantic_patch, cmap="gray", vmin=0, vmax=1) + im = ax.imshow(semantic_patch) + plt.colorbar(im, ax=ax) + center = semantic_patch[32, 32] + on_center = center > 0.0 + ax.set_title(f"Semantic: on_center={on_center}") + + for ax in axes.flatten(): + ax.axis("off") + fig.tight_layout(pad=0.1) + figdir = Path("/Users/sknudstrup/figs") + figdir.mkdir(parents=True, exist_ok=True) + n_files = len(list(figdir.glob("*.png"))) + + fig.savefig(figdir / f"{n_files}.png") diff --git a/src/tbp/monty/frameworks/environments/embodied_data.py b/src/tbp/monty/frameworks/environments/embodied_data.py index 10425766..d0d4a3ec 100644 --- a/src/tbp/monty/frameworks/environments/embodied_data.py +++ 
b/src/tbp/monty/frameworks/environments/embodied_data.py @@ -223,7 +223,7 @@ def __init__(self, object_names, object_init_sampler, *args, **kwargs): self.object_names = object_names # Return an (ordered) list of unique items: self.source_object_list = list(dict.fromkeys(object_names)) - self.num_distactors = 0 + self.num_distractors = 0 elif isinstance(object_names, dict): # TODO when we want more advanced multi-object experiments, update these # arguments along with the Object Initializers so that we can easily @@ -233,7 +233,7 @@ def __init__(self, object_names, object_init_sampler, *args, **kwargs): self.source_object_list = list( dict.fromkeys(object_names["source_object_list"]) ) - self.num_distactors = object_names["num_distractors"] + self.num_distractors = object_names["num_distractors"] else: raise ValueError("Object names should be a list or dictionary") self.create_semantic_mapping() @@ -327,7 +327,7 @@ def change_object_by_idx(self, idx): name=self.object_names[idx], **init_params ) - if self.num_distactors > 0: + if self.num_distractors > 0: self.add_distractor_objects( primary_target_obj, init_params, @@ -366,7 +366,7 @@ def add_distractor_objects( item for item in self.source_object_list if item != primary_target_name ] - for __ in range(self.num_distactors): + for __ in range(self.num_distractors): new_init_params = copy.deepcopy(init_params) new_obj_label = self.rng.choice(sampling_list) @@ -482,10 +482,12 @@ def __next__(self): def pre_episode(self): super().pre_episode() if not self.dataset.env._agents[0].action_space_type == "surface_agent": - on_object = self.get_good_view_with_patch_refinement() - assert ( - on_object - ), "Primary target must be visible at the start of the episode" + on_target_object = self.get_good_view_with_patch_refinement() + if self.num_distractors == 0: + # Only perform this check if we aren't doing multi-object experiments. + assert ( + on_target_object + ), "Primary target must be visible at the start of the episode" def first_step(self): """Carry out particular motor-system state updates required on the first step. @@ -551,15 +553,15 @@ def get_good_view( # TODO break up this method so that there is less code duplication # Start by ensuring the center of the patch is covering the primary target # object before we start moving forward; only done for multi-object experiments - multiple_objects_present = self.num_distactors > 0 + multiple_objects_present = self.num_distractors > 0 if multiple_objects_present: - actions, on_object = self.motor_system.orient_to_object( + actions, on_target_object = self.motor_system.orient_to_object( self._observation, view_sensor_id, target_semantic_id=self.primary_target["semantic_id"], multiple_objects_present=multiple_objects_present, ) - if not on_object: + if not on_target_object: for action in actions: self._observation, self.motor_system.state = self.dataset[action] @@ -583,25 +585,25 @@ def get_good_view( ) # Re-center ourselves (if necessary) after having moved closer - actions, on_object = self.motor_system.orient_to_object( + actions, on_target_object = self.motor_system.orient_to_object( self._observation, view_sensor_id, target_semantic_id=self.primary_target["semantic_id"], multiple_objects_present=multiple_objects_present, ) - if not on_object: + if not on_target_object: for action in actions: self._observation, self.motor_system.state = self.dataset[action] # Final check that we're on the object. May be used by calling function # to raise an error. 
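         # (The returned actions are discarded; only the boolean on-target flag
         # is propagated, e.g. to the assertion in pre_episode above.)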
- _, on_object = self.motor_system.orient_to_object( + _, on_target_object = self.motor_system.orient_to_object( self._observation, view_sensor_id, target_semantic_id=self.primary_target["semantic_id"], multiple_objects_present=multiple_objects_present, ) - return on_object + return on_target_object def get_good_view_with_patch_refinement(self) -> bool: """Policy to get a good view of the object for the central patch. @@ -622,9 +624,9 @@ def get_good_view_with_patch_refinement(self) -> bool: self.get_good_view("view_finder") for patch_id in ("patch", "patch_0"): if patch_id in self._observation["agent_id_0"].keys(): - on_object = self.get_good_view(patch_id, allow_translation=False) + on_target_object = self.get_good_view(patch_id, allow_translation=False) break - return on_object + return on_target_object def execute_jump_attempt(self): """Attempt a hypothesis-testing "jump" onto a location of the object. diff --git a/src/tbp/monty/frameworks/models/motor_policies.py b/src/tbp/monty/frameworks/models/motor_policies.py index 97e7bc46..d6403bf4 100644 --- a/src/tbp/monty/frameworks/models/motor_policies.py +++ b/src/tbp/monty/frameworks/models/motor_policies.py @@ -573,7 +573,7 @@ def move_close_enough( Returns: Tuple[Union[Action, None], bool]: The next action to take and whether the - episode is done + episode is done. Raises: ValueError: If the object is not visible @@ -656,7 +656,10 @@ def orient_to_object( scene. Returns: - Two actions to execute to put the patch on the object + A (possibly empty) list of actions and a bool that indicates whether we + are already on the target object. If we are not on the target object, the + list of actions is of length two and is composed of actions needed to get + us onto the target object. """ # Reconstruct 2D semantic map. depth_image = raw_observation[self.agent_id][view_sensor_id]["depth"] From 5c881400b8a85745f0c824b85e5f3a2bcf31966b Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Wed, 8 Jan 2025 17:11:53 -0500 Subject: [PATCH 19/21] Delete sandbox.py --- benchmarks/sandbox.py | 150 ------------------------------------------ 1 file changed, 150 deletions(-) delete mode 100644 benchmarks/sandbox.py diff --git a/benchmarks/sandbox.py b/benchmarks/sandbox.py deleted file mode 100644 index 328aeb43..00000000 --- a/benchmarks/sandbox.py +++ /dev/null @@ -1,150 +0,0 @@ -import os - -from tbp.monty.frameworks.run import main # noqa: E402 -from tbp.monty.frameworks.run_env import setup_env - -# os.environ["saveplots"] = "True" - -setup_env() -from configs import CONFIGS # noqa: E402 - -main(all_configs=CONFIGS, experiments=["randrot_noise_sim_on_scan_monty_world"]) - -# from numbers import Number -# from typing import Tuple - -# import matplotlib.pyplot as plt -# import numpy as np - -# from tbp.monty.frameworks.environment_utils.transforms import DepthTo3DLocations - - -# def get_on_surface_th( -# depth_patch, -# min_depth_range: Number, -# default_on_surface_th: Number, -# ) -> Tuple[Number, bool]: -# """Return a depth threshold if we have a bimodal depth distribution. - -# If the depth values are in a large enough range (> min_depth_range) we may -# be looking at more than one surface within our patch. This could either be -# two disjoint surfaces of the object or the object and the background. - -# To figure out if we have two disjoint sets of depth values we look at the -# histogram and check for empty bins in the middle. The center of the empty -# part if the histogram will be defined as the threshold. 
- -# Next, we want to check if we should use the depth values above or below the -# threshold. Currently this is done by looking which side of the distribution -# is larger (occupies more space in the patch). Alternatively we could check -# which side the depth at the center of the patch is on. I'm not sure what would -# be better. - -# Lastly, if we do decide to use the depth points that are further away, we need -# to make sure they are not the points that are off the object. Currently this is -# just done with a simple heuristic (depth difference < 0.1) but in the future we -# will probably have to find a better solution for this. - -# Args: -# depth_patch: sensor patch observations of depth -# min_depth_range: minimum range of depth values to even be considered -# default_on_surface_th: default threshold to use if no bimodal distribution -# is found -# Returns: -# threshold and whether we want to use values above or below threshold -# """ -# depths = np.asarray(depth_patch).flatten() -# flip_sign = False -# th = default_on_surface_th -# if (max(depths) - min(depths)) > min_depth_range: -# # only check for bimodal distribution if we have a large enough -# # range in depth values -# height, bins = np.histogram( -# np.array(depth_patch).flatten(), bins=8, density=False -# ) -# gap = np.where(height == 0)[0] -# if len(gap) > 0: -# # There is a bimodal distribution -# gap_center = len(gap) // 2 -# th_id = gap[gap_center] -# th = bins[th_id] -# # Check which side of the distribution we should use -# if np.sum(height[:th_id]) < np.sum(height[th_id:]): -# # more points in the patch are on the further away surface -# if (bins[-1] - bins[0]) < 0.1: -# # not too large distance between depth values -> avoid -# # flipping sign when off object -# flip_sign = True -# return th, flip_sign - - -# def get_semantic_from_depth( -# depth_patch: np.ndarray, -# default_on_surface_th: Number, -# ) -> np.ndarray: -# """Return semantic patch information from heuristics on depth patch. - -# Args: -# depth_patch: sensor patch observations of depth -# default_on_surface_th: default threshold to use if no bimodal distribution -# is found -# Returns: -# sensor patch shaped info about whether each pixel is on surface of not -# """ -# # avoid large range when seeing the table (goes up to almost 100 and then -# # just using 8 bins will not work anymore) -# depth_patch = np.array(depth_patch) -# depth_patch[depth_patch > 1] = 1.0 - -# # If all depth values are at maximum (1.0), then we are automatically -# # off-object. -# if np.all(depth_patch == 1.0): -# return np.zeros_like(depth_patch, dtype=bool) - -# # Compute the on-suface depth threshold (and whether we need to flip the -# # sign), and apply it to the depth to get the semantic patch. 
-# th, flip_sign = get_on_surface_th( -# depth_patch, -# min_depth_range=0.01, -# default_on_surface_th=default_on_surface_th, -# ) -# print("flip_sign", flip_sign) -# if flip_sign is False: -# semantic_patch = depth_patch < th -# else: -# semantic_patch = depth_patch > th -# return semantic_patch - - -# depth_path = "/Users/sknudstrup/depth.npy" -# depth = np.load(depth_path) -# # t = DepthTo3DLocations() -# clip_value = 0.05 -# default_on_surface_th = clip_value - -# # semantic = get_semantic_from_depth(depth, default_on_surface_th) -# # plt.imshow(semantic) -# # plt.colorbar() -# # plt.show() -# depth_patch = depth.copy() -# min_depth_range = 0.01 -# depths = np.asarray(depth_patch).flatten() -# flip_sign = False -# th = default_on_surface_th -# if (max(depths) - min(depths)) > min_depth_range: -# # only check for bimodal distribution if we have a large enough -# # range in depth values -# height, bins = np.histogram(np.array(depth_patch).flatten(), bins=8, density=False) -# gap = np.where(height == 0)[0] -# if len(gap) > 0: -# # There is a bimodal distribution -# gap_center = len(gap) // 2 -# th_id = gap[gap_center] -# th = bins[th_id] -# # Check which side of the distribution we should use -# if np.sum(height[:th_id]) < np.sum(height[th_id:]): -# # more points in the patch are on the further away surface -# if (bins[-1] - bins[0]) < 0.1: -# # not too large distance between depth values -> avoid -# # flipping sign when off object -# flip_sign = True From 23842cd5f1f28d427e6acfb9e245b8d855142a00 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Wed, 8 Jan 2025 17:31:52 -0500 Subject: [PATCH 20/21] Update transforms.py --- .../environment_utils/transforms.py | 90 ++----------------- 1 file changed, 6 insertions(+), 84 deletions(-) diff --git a/src/tbp/monty/frameworks/environment_utils/transforms.py b/src/tbp/monty/frameworks/environment_utils/transforms.py index fd0bfc46..75c161c1 100644 --- a/src/tbp/monty/frameworks/environment_utils/transforms.py +++ b/src/tbp/monty/frameworks/environment_utils/transforms.py @@ -7,21 +7,18 @@ # license that can be found in the LICENSE file or at # https://opensource.org/licenses/MIT. -import os from numbers import Number -from pathlib import Path from typing import Tuple -import matplotlib.pyplot as plt import numpy as np import quaternion as qt import scipy __all__ = [ - "MissingToMaxDepth", "AddNoiseToRawDepthImage", - "GaussianSmoothing", "DepthTo3DLocations", + "GaussianSmoothing", + "MissingToMaxDepth", ] @@ -315,7 +312,8 @@ def __call__(self, observations, state=None): agent_obs = observations[self.agent_id][sensor_id] depth_obs = agent_obs["depth"] - # Initialize rudimentary semantic mask that makes off-object pixels 0. + # Initialize rudimentary semantic mask needed to mask off-object pixels. + # If we add it here, we'll delete it before exiting the function. 
if "semantic" in agent_obs.keys(): semantic_added = False else: @@ -341,14 +339,11 @@ def __call__(self, observations, state=None): if self.depth_clip_sensors and self.use_semantic_sensor: semantic_obs = agent_obs["semantic"] else: - surface_obs = self.get_semantic_from_depth( + semantic_obs = self.get_semantic_from_depth( depth_obs, default_on_surface_th, agent_obs["semantic"], ) - # set pixel to 1 if it is on the main surface and on the object - # semantic_obs = agent_obs["semantic"] * surface_obs - semantic_obs = surface_obs # Approximate true world coordinates x, y = np.meshgrid( @@ -415,10 +410,7 @@ def __call__(self, observations, state=None): # a deepcopy because we are appending a new observation observations[self.agent_id][sensor_id]["semantic_3d"] = semantic_3d - # Check if plots should be saved - if os.environ.get("saveplots"): - do_plots(observations) - + # Delete the added semantic mask if it was added. if semantic_added: del agent_obs["semantic"] @@ -537,73 +529,3 @@ def get_semantic_from_depth( semantic_patch = semantic_patch * semantic_mask return semantic_patch - -def do_plots(observations): - fig, axes = plt.subplots(2, 3, figsize=[7.02, 4]) - - sensor_id = "view_finder" - agent_obs = observations["agent_id_0"][sensor_id] - rgba_patch = agent_obs["rgba"] - depth_patch = agent_obs["depth"] - semantic_patch_1d = agent_obs["semantic_3d"][:, 3] - semantic_patch = semantic_patch_1d.reshape( - depth_patch.shape[0], depth_patch.shape[1] - ) - - ax = axes[0, 0] - ax.imshow(rgba_patch) - ax.set_title("RGBA") - - ax = axes[0, 1] - vmin = vmax = None - # if sensor_id == "patch": - # vmin, vmax = 0.0, 0.5 - im = ax.imshow(depth_patch, cmap="gray", vmin=vmin, vmax=vmax) - plt.colorbar(im, ax=ax) - ax.set_title(f"Depth") - - ax = axes[0, 2] - # im = ax.imshow(semantic_patch, cmap="gray", vmin=0, vmax=1) - im = ax.imshow(semantic_patch) - plt.colorbar(im, ax=ax) - center = semantic_patch[32, 32] - on_center = center > 0.0 - ax.set_title(f"Semantic: on_center={on_center}") - - sensor_id = "patch" - agent_obs = observations["agent_id_0"][sensor_id] - rgba_patch = agent_obs["rgba"] - depth_patch = agent_obs["depth"] - semantic_patch_1d = agent_obs["semantic_3d"][:, 3] - semantic_patch = semantic_patch_1d.reshape( - depth_patch.shape[0], depth_patch.shape[1] - ) - - ax = axes[1, 0] - ax.imshow(rgba_patch) - ax.set_title("RGBA") - - ax = axes[1, 1] - vmin = vmax = None - # if sensor_id == "patch": - # vmin, vmax = 0.0, 0.05 - im = ax.imshow(depth_patch, cmap="gray", vmin=vmin, vmax=vmax) - plt.colorbar(im, ax=ax) - ax.set_title(f"Depth") - - ax = axes[1, 2] - # im = ax.imshow(semantic_patch, cmap="gray", vmin=0, vmax=1) - im = ax.imshow(semantic_patch) - plt.colorbar(im, ax=ax) - center = semantic_patch[32, 32] - on_center = center > 0.0 - ax.set_title(f"Semantic: on_center={on_center}") - - for ax in axes.flatten(): - ax.axis("off") - fig.tight_layout(pad=0.1) - figdir = Path("/Users/sknudstrup/figs") - figdir.mkdir(parents=True, exist_ok=True) - n_files = len(list(figdir.glob("*.png"))) - - fig.savefig(figdir / f"{n_files}.png") From edcd23093c3f977a5fa12f364b8d70cd335586c1 Mon Sep 17 00:00:00 2001 From: Scott Knudstrup Date: Thu, 9 Jan 2025 18:37:23 -0500 Subject: [PATCH 21/21] Update tables --- benchmarks/results/montymeetsworld.csv | 12 +++--- benchmarks/results/ycb_10objs.csv | 24 +++++------ benchmarks/results/ycb_77objs.csv | 10 ++--- benchmarks/results/ycb_unsupervised.csv | 6 +-- docs/overview/benchmark-experiments.md | 56 ++++++++++++------------- 5 files changed, 54 
insertions(+), 54 deletions(-) diff --git a/benchmarks/results/montymeetsworld.csv b/benchmarks/results/montymeetsworld.csv index 2fb82124..18a64b64 100644 --- a/benchmarks/results/montymeetsworld.csv +++ b/benchmarks/results/montymeetsworld.csv @@ -1,7 +1,7 @@ Experiment,% Correct,% Used MLH,Num Matching Steps,Rotation Error (radians),Run Time,Episode Run Time (s) -randrot_noise_sim_on_scan_monty_world,80.00%,85.83%,437,0.94,54m,25s -world_image_on_scanned_model,66.67%,87.50%,453,2.05,16m,19s -dark_world_image_on_scanned_model,43.75%,77.08%,433,1.87,15m,18s -bright_world_image_on_scanned_model,47.92%,83.33%,457,2.16,22m,27s -hand_intrusion_world_image_on_scanned_model,54.17%,47.92%,333,1.79,11m,13s -multi_object_world_image_on_scanned_model,41.67%,39.58%,298,1.67,10m,12s \ No newline at end of file +randrot_noise_sim_on_scan_monty_world,80.00%,80.83%,412,0.86,52m,24s +world_image_on_scanned_model,72.92%,83.33%,442,2.15,18m,21s +dark_world_image_on_scanned_model,35.42%,83.33%,430,1.81,16m,20s +bright_world_image_on_scanned_model,43.75%,79.17%,428,1.87,18m,22s +hand_intrusion_world_image_on_scanned_model,39.58%,56.25%,344,2.00,8m,10s +multi_object_world_image_on_scanned_model,43.75%,52.08%,344,1.90,11m,14s \ No newline at end of file diff --git a/benchmarks/results/ycb_10objs.csv b/benchmarks/results/ycb_10objs.csv index b004069a..3cb862ad 100644 --- a/benchmarks/results/ycb_10objs.csv +++ b/benchmarks/results/ycb_10objs.csv @@ -1,13 +1,13 @@ Experiment,% Correct,% Used MLH,Num Matching Steps,Rotation Error (radians),Run Time,Episode Run Time (s) -base_config_10distinctobj_dist_agent,99.29%,5.00%,34,0.27,6m,20s -base_config_10distinctobj_surf_agent,100.00%,0.00%,28,0.17,4m,19s -randrot_noise_10distinctobj_dist_agent,98.00%,6.00%,47,0.45,5m,31s -randrot_noise_10distinctobj_dist_on_distm,100.00%,2.00%,36,0.26,4m,28s -randrot_noise_10distinctobj_surf_agent,99.00%,0.00%,28,0.33,4m,27s -randrot_10distinctobj_surf_agent,100.00%,0.00%,29,0.40,3m,19s -randrot_noise_10distinctobj_5lms_dist_agent,100.00%,7.00%,52,0.86,18m,86s -base_10simobj_surf_agent,95.00%,7.86%,70,0.16,8m,41s -randrot_noise_10simobj_dist_agent,82.00%,40.00%,182,0.61,16m,116s -randrot_noise_10simobj_surf_agent,90.00%,34.00%,180,0.50,24m,203s -randomrot_rawnoise_10distinctobj_surf_agent,73.00%,78.00%,15,1.54,11m,12s -base_10multi_distinctobj_dist_agent,69.29%,47.14%,25,0.82,1h6m,2s \ No newline at end of file +base_config_10distinctobj_dist_agent,99.29%,3.57%,34,0.24,5m,19s +base_config_10distinctobj_surf_agent,100.00%,0.00%,28,0.18,4m,21s +randrot_noise_10distinctobj_dist_agent,99.00%,6.00%,48,0.46,5m,29s +randrot_noise_10distinctobj_dist_on_distm,100.00%,2.00%,36,0.25,4m,28s +randrot_noise_10distinctobj_surf_agent,100.00%,1.00%,29,0.36,5m,33s +randrot_10distinctobj_surf_agent,100.00%,0.00%,28,0.38,3m,17s +randrot_noise_10distinctobj_5lms_dist_agent,100.00%,6.00%,53,0.84,15m,82s +base_10simobj_surf_agent,93.57%,10.00%,78,0.16,10m,53s +randrot_noise_10simobj_dist_agent,82.00%,42.00%,189,0.59,16m,119s +randrot_noise_10simobj_surf_agent,91.00%,31.00%,162,0.42,23m,192s +randomrot_rawnoise_10distinctobj_surf_agent,72.00%,76.00%,16,1.56,15m,15s +base_10multi_distinctobj_dist_agent,73.57%,40.00%,27,0.73,1h5m,2s \ No newline at end of file diff --git a/benchmarks/results/ycb_77objs.csv b/benchmarks/results/ycb_77objs.csv index 2831d451..36100199 100644 --- a/benchmarks/results/ycb_77objs.csv +++ b/benchmarks/results/ycb_77objs.csv @@ -1,6 +1,6 @@ Experiment,% Correct,% Used MLH,Num Matching Steps,Rotation Error (radians),Run Time,Episode 
Run Time (s) -base_77obj_dist_agent,93.07%,14.72%,86,0.33,1h4m,197s -base_77obj_surf_agent,98.27%,5.19%,57,0.21,31m,96s -randrot_noise_77obj_dist_agent,87.01%,29.87%,148,0.69,1h33m,314s -randrot_noise_77obj_surf_agent,94.81%,19.91%,107,0.61,55m,198s -randrot_noise_77obj_5lms_dist_agent,84.42%,9.09%,64,1.07,42m,800s \ No newline at end of file +base_77obj_dist_agent,93.07%,13.85%,86,0.32,56m,178s +base_77obj_surf_agent,98.70%,6.49%,56,0.16,35m,104s +randrot_noise_77obj_dist_agent,87.45%,30.74%,149,0.70,1h17m,264s +randrot_noise_77obj_surf_agent,96.54%,18.61%,103,0.58,1h3m,225s +randrot_noise_77obj_5lms_dist_agent,84.42%,9.09%,66,1.08,39m,843s \ No newline at end of file diff --git a/benchmarks/results/ycb_unsupervised.csv b/benchmarks/results/ycb_unsupervised.csv index 063d9110..992478cb 100644 --- a/benchmarks/results/ycb_unsupervised.csv +++ b/benchmarks/results/ycb_unsupervised.csv @@ -1,4 +1,4 @@ Experiment,%Correct - 1st Epoch,% Correct - >1st Epoch,Mean Objects per Graph,Mean Graphs per Object,Run Time,Episode Run Time (s) -surf_agent_unsupervised_10distinctobj,80.00%,86.67%,1.11,1.11,16m,10s -surf_agent_unsupervised_10distinctobj_noise,80.00%,67.78%,1.09,2.78,22m,13s -surf_agent_unsupervised_10simobj,50.00%,76.67%,2.75,2.20,25m,15s \ No newline at end of file +surf_agent_unsupervised_10distinctobj,70.00%,83.33%,1.43,1.11,20m,12s +surf_agent_unsupervised_10distinctobj_noise,70.00%,67.78%,1.19,2.11,24m,15s +surf_agent_unsupervised_10simobj,40.00%,86.67%,2.60,1.30,28m,17s \ No newline at end of file diff --git a/docs/overview/benchmark-experiments.md b/docs/overview/benchmark-experiments.md index fdd71961..4f859e74 100644 --- a/docs/overview/benchmark-experiments.md +++ b/docs/overview/benchmark-experiments.md @@ -51,18 +51,18 @@ The following results are obtained from experiments using the 10-object subsets | Experiment | % Correct | % Used MLH | Num Matching Steps | Rotation Error (radians) | Run Time | Episode Run Time (s) | |---------------------------------------------|-----------|------------|--------------------|--------------------------|----------|----------------------| -| base_config_10distinctobj_dist_agent | 99.29% | 5.00% | 34 | 0.27 | 6m | 20s | -| base_config_10distinctobj_surf_agent | 100.00% | 0.00% | 28 | 0.17 | 4m | 19s | -| randrot_noise_10distinctobj_dist_agent | 98.00% | 6.00% | 47 | 0.45 | 5m | 31s | -| randrot_noise_10distinctobj_dist_on_distm | 100.00% | 2.00% | 36 | 0.26 | 4m | 28s | -| randrot_noise_10distinctobj_surf_agent | 99.00% | 0.00% | 28 | 0.33 | 4m | 27s | -| randrot_10distinctobj_surf_agent | 100.00% | 0.00% | 29 | 0.40 | 3m | 19s | -| randrot_noise_10distinctobj_5lms_dist_agent | 100.00% | 7.00% | 52 | 0.86 | 18m | 86s | -| base_10simobj_surf_agent | 95.00% | 7.86% | 70 | 0.16 | 8m | 41s | -| randrot_noise_10simobj_dist_agent | 82.00% | 40.00% | 182 | 0.61 | 16m | 116s | -| randrot_noise_10simobj_surf_agent | 90.00% | 34.00% | 180 | 0.50 | 24m | 203s | -| randomrot_rawnoise_10distinctobj_surf_agent | 73.00% | 78.00% | 15 | 1.54 | 11m | 12s | -| base_10multi_distinctobj_dist_agent | 69.29% | 47.14% | 25 | 0.82 | 1h6m | 2s | +| base_config_10distinctobj_dist_agent | 99.29% | 3.57% | 34 | 0.24 | 5m | 19s | +| base_config_10distinctobj_surf_agent | 100.00% | 0.00% | 28 | 0.18 | 4m | 21s | +| randrot_noise_10distinctobj_dist_agent | 99.00% | 6.00% | 48 | 0.46 | 5m | 29s | +| randrot_noise_10distinctobj_dist_on_distm | 100.00% | 2.00% | 36 | 0.25 | 4m | 28s | +| randrot_noise_10distinctobj_surf_agent | 100.00% | 1.00% | 29 | 0.36 | 5m | 33s | +| 
randrot_10distinctobj_surf_agent            | 100.00%   | 0.00%      | 28                 | 0.38                     | 3m       | 17s                  |
+| randrot_noise_10distinctobj_5lms_dist_agent | 100.00%   | 6.00%      | 53                 | 0.84                     | 15m      | 82s                  |
+| base_10simobj_surf_agent                    | 93.57%    | 10.00%     | 78                 | 0.16                     | 10m      | 53s                  |
+| randrot_noise_10simobj_dist_agent           | 82.00%    | 42.00%     | 189                | 0.59                     | 16m      | 119s                 |
+| randrot_noise_10simobj_surf_agent           | 91.00%    | 31.00%     | 162                | 0.42                     | 23m      | 192s                 |
+| randomrot_rawnoise_10distinctobj_surf_agent | 72.00%    | 76.00%     | 16                 | 1.56                     | 15m      | 15s                  |
+| base_10multi_distinctobj_dist_agent         | 73.57%    | 40.00%     | 27                 | 0.73                     | 1h5m     | 2s                   |

 ## Longer Experiments with all 77 YCB Objects

@@ -75,11 +75,11 @@ The following results are obtained from experiments on the entire YCB dataset (7

 | Experiment                          | % Correct | % Used MLH | Num Matching Steps | Rotation Error (radians) | Run Time | Episode Run Time (s) |
 |-------------------------------------|-----------|------------|--------------------|--------------------------|----------|----------------------|
-| base_77obj_dist_agent               | 93.07%    | 14.72%     | 86                 | 0.33                     | 1h4m     | 197s                 |
-| base_77obj_surf_agent               | 98.27%    | 5.19%      | 57                 | 0.21                     | 31m      | 96s                  |
-| randrot_noise_77obj_dist_agent      | 87.01%    | 29.87%     | 148                | 0.69                     | 1h33m    | 314s                 |
-| randrot_noise_77obj_surf_agent      | 94.81%    | 19.91%     | 107                | 0.61                     | 55m      | 198s                 |
-| randrot_noise_77obj_5lms_dist_agent | 84.42%    | 9.09%      | 64                 | 1.07                     | 42m      | 800s                 |
+| base_77obj_dist_agent               | 93.07%    | 13.85%     | 86                 | 0.32                     | 56m      | 178s                 |
+| base_77obj_surf_agent               | 98.70%    | 6.49%      | 56                 | 0.16                     | 35m      | 104s                 |
+| randrot_noise_77obj_dist_agent      | 87.45%    | 30.74%     | 149                | 0.70                     | 1h17m    | 264s                 |
+| randrot_noise_77obj_surf_agent      | 96.54%    | 18.61%     | 103                | 0.58                     | 1h3m     | 225s                 |
+| randrot_noise_77obj_5lms_dist_agent | 84.42%    | 9.09%      | 66                 | 1.08                     | 39m      | 843s                 |

 ### Explanation of Some of the Results

@@ -111,9 +111,9 @@ An object is classified as detected correctly if the detected object ID is in th

 | Experiment                                  | %Correct - 1st Epoch | % Correct - >1st Epoch | Mean Objects per Graph | Mean Graphs per Object | Run Time | Episode Run Time (s) |
 |---------------------------------------------|----------------------|------------------------|------------------------|------------------------|----------|----------------------|
-| surf_agent_unsupervised_10distinctobj       | 80.00%               | 86.67%                 | 1.11                   | 1.11                   | 16m      | 10s                  |
-| surf_agent_unsupervised_10distinctobj_noise | 80.00%               | 67.78%                 | 1.09                   | 2.78                   | 22m      | 13s                  |
-| surf_agent_unsupervised_10simobj            | 50.00%               | 76.67%                 | 2.75                   | 2.20                   | 25m      | 15s                  |
+| surf_agent_unsupervised_10distinctobj       | 70.00%               | 83.33%                 | 1.43                   | 1.11                   | 20m      | 12s                  |
+| surf_agent_unsupervised_10distinctobj_noise | 70.00%               | 67.78%                 | 1.19                   | 2.11                   | 24m      | 15s                  |
+| surf_agent_unsupervised_10simobj            | 40.00%               | 86.67%                 | 2.60                   | 1.30                   | 28m      | 17s                  |

 To obtain these results, use `print_unsupervised_stats(train_stats, epoch_len=10)` (wandb logging is currently not implemented for unsupervised stats). Unsupervised continual learning cannot, by definition, be parallelized across epochs, so these experiments were run without multiprocessing. Running on cloud CPUs works as well, but since they are slower without parallelization, these runs were done on the laptop.
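
A minimal usage sketch of the statistics call mentioned above, for readers reproducing these numbers (the import path and the CSV location are illustrative assumptions, not something this patch series changes):

    import pandas as pd

    from tbp.monty.frameworks.utils.logging_utils import print_unsupervised_stats

    # Detailed training statistics written out by the experiment run
    # (the path below is a placeholder for your own results directory).
    train_stats = pd.read_csv("results/surf_agent_unsupervised_10distinctobj/train_stats.csv")

    # One epoch cycles through all 10 objects once, hence epoch_len=10.
    print_unsupervised_stats(train_stats, epoch_len=10)
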
@@ -157,14 +157,14 @@ See the [monty_lab project folder](https://github.com/thousandbrainsproject/mont

 ### Results

-| Experiment                                  | % Correct | % Used MLH | Num Matching Steps | [Rotation Error (radians)]   | Run Time | Episode Run Time (s) |
-|---------------------------------------------|-----------|------------|--------------------|------------------------------|----------|----------------------|
-| randrot_noise_sim_on_scan_monty_world       | 80.00%    | 85.83%     | 437                | 0.94                         | 54m      | 25s                  |
-| world_image_on_scanned_model                | 66.67%    | 87.50%     | 453                | 2.05                         | 16m      | 19s                  |
-| dark_world_image_on_scanned_model           | 43.75%    | 77.08%     | 433                | 1.87                         | 15m      | 18s                  |
-| bright_world_image_on_scanned_model         | 47.92%    | 83.33%     | 457                | 2.16                         | 22m      | 27s                  |
-| hand_intrusion_world_image_on_scanned_model | 54.17%    | 47.92%     | 333                | 1.79                         | 11m      | 13s                  |
-| multi_object_world_image_on_scanned_model   | 41.67%    | 39.58%     | 298                | 1.67                         | 10m      | 12s                  |
+| Experiment                                  | % Correct | % Used MLH | Num Matching Steps | [Rotation Error (radians)] | Run Time | Episode Run Time (s) |
+|---------------------------------------------|-----------|------------|--------------------|----------------------------|----------|----------------------|
+| randrot_noise_sim_on_scan_monty_world       | 80.00%    | 80.83%     | 412                | 0.86                       | 52m      | 24s                  |
+| world_image_on_scanned_model                | 72.92%    | 83.33%     | 442                | 2.15                       | 18m      | 21s                  |
+| dark_world_image_on_scanned_model           | 35.42%    | 83.33%     | 430                | 1.81                       | 16m      | 20s                  |
+| bright_world_image_on_scanned_model         | 43.75%    | 79.17%     | 428                | 1.87                       | 18m      | 22s                  |
+| hand_intrusion_world_image_on_scanned_model | 39.58%    | 56.25%     | 344                | 2.00                       | 8m       | 10s                  |
+| multi_object_world_image_on_scanned_model   | 43.75%    | 52.08%     | 344                | 1.90                       | 11m      | 14s                  |

 **Note that rotation errors are meaningless since no ground-truth rotation is provided.**
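
As a closing aside for anyone extending these experiments: throughout this series the policies rebuild the 2D semantic map from the `semantic_3d` observation rather than reading a separate semantic sensor. A standalone sketch of that reconstruction (the helper function and its name are our own illustration; the reshape itself mirrors the code added to `move_close_enough` and `orient_to_object`):

    def reconstruct_semantic_map(raw_observation, agent_id, sensor_id):
        """Rebuild the 2D semantic map from the flattened semantic_3d array.

        semantic_3d has one row per pixel: [x, y, z, semantic_id].
        """
        depth_image = raw_observation[agent_id][sensor_id]["depth"]
        semantic_3d = raw_observation[agent_id][sensor_id]["semantic_3d"]
        # Column 3 holds the per-pixel semantic ID; restore the image shape.
        return semantic_3d[:, 3].reshape(depth_image.shape).astype(int)

For example, `reconstruct_semantic_map(obs, "agent_id_0", "view_finder") == target_semantic_id` yields the boolean on-target mask that the policies above compute.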