Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Tiled rendering #866

Draft
wants to merge 3 commits into
base: og-develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions omnigibson/envs/vec_env_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from tqdm import trange

import omnigibson as og
from omnigibson.sensors import TiledCamera


class VectorEnvironment:
Expand All @@ -19,6 +20,8 @@ def __init__(self, num_envs, config):
for _ in trange(num_envs, desc="Loading environments")
]

self.tiled_camera = TiledCamera(modalities=["rgb", "depth"])

# Play, and finish loading all the envs
og.sim.play()
for env in self.envs:
Expand All @@ -29,19 +32,30 @@ def step(self, actions):
for i, action in enumerate(actions):
self.envs[i]._pre_step(action)
og.sim.step()

tiled_buffer = self.tiled_camera.get_obs()

rgb_tile = tiled_buffer["rgb"].cpu().numpy()
depth_tile = tiled_buffer["depth"].cpu().numpy()

for i, action in enumerate(actions):
# TODO: ignore camera observation here
# TODO: potentially, we could get the tiled image first, segment it, and then replace all the normal camera observations with the segmented tiled image
obs, reward, terminated, truncated, info = self.envs[i]._post_step(action)
observations.append(obs)
rewards.append(reward)
terminates.append(terminated)
truncates.append(truncated)
infos.append(info)

return observations, rewards, terminates, truncates, infos

def reset(self):
for env in self.envs:
env.reset()

# TODO: reset tiled rendering camera

def close(self):
pass

Expand Down
1 change: 1 addition & 0 deletions omnigibson/sensors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from omnigibson.sensors.scan_sensor import ScanSensor
from omnigibson.sensors.sensor_base import ALL_SENSOR_MODALITIES, REGISTERED_SENSORS, BaseSensor
from omnigibson.sensors.sensor_noise_base import REGISTERED_SENSOR_NOISES, BaseSensorNoise
from omnigibson.sensors.tiled_camera import TiledCamera
from omnigibson.sensors.vision_sensor import VisionSensor
from omnigibson.utils.python_utils import assert_valid_key

Expand Down
96 changes: 96 additions & 0 deletions omnigibson/sensors/tiled_camera.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import math

import torch as th

import omnigibson.lazy as lazy
from omnigibson.sensors.vision_sensor import VisionSensor


class TiledCamera:
    """
    Renders all registered VisionSensors into one tiled image via omni.replicator's
    tiled sensor, then splits that tile back into a per-camera batch on the GPU.

    Args:
        modalities (None or list of str): Modality(s) supported by this sensor.
            Defaults to ["rgb"]; can also include "depth".
    """

    def __init__(self, modalities=None):
        # Avoid a shared mutable default argument; copy caller-provided lists.
        self.modalities = ["rgb"] if modalities is None else list(modalities)

        # All registered cameras are packed into one tile grid, so they must
        # share a single (width, height) resolution.
        self._camera_resolution = None
        camera_prim_paths = []
        for sensor in VisionSensor.SENSORS.values():
            if self._camera_resolution is None:
                self._camera_resolution = (sensor.image_width, sensor.image_height)
            else:
                assert self._camera_resolution == (
                    sensor.image_width,
                    sensor.image_height,
                ), "All cameras must have the same resolution!"
            camera_prim_paths.append(sensor.prim_path)

        # Keep handles on the underlying USD camera prims being tiled.
        stage = lazy.omni.usd.get_context().get_stage()
        self._camera_prims = []
        for path in camera_prim_paths:
            camera_prim = stage.GetPrimAtPath(path)
            self._camera_prims.append(lazy.pxr.UsdGeom.Camera(camera_prim))

        # Create the tiled sensor + render product, and attach the raw GPU annotator.
        tiled_camera = lazy.omni.replicator.core.create.tiled_sensor(
            cameras=camera_prim_paths,
            camera_resolution=self._camera_resolution,
            tiled_resolution=self._tiled_img_shape(),
            output_types=self.modalities,
        )
        self._render_product_path = lazy.omni.replicator.core.create.render_product(
            camera=tiled_camera, resolution=self._tiled_img_shape()
        )
        self._annotator = lazy.omni.replicator.core.AnnotatorRegistry.get_annotator(
            "RtxSensorGpu", device="cuda:0", do_array_copy=False
        )
        self._annotator.attach([self._render_product_path])

        # Pre-allocate per-modality output batches of shape (num_cameras, H, W, C).
        # th.zeros already returns a contiguous tensor, so no .contiguous() needed.
        self._output_buffer = dict()
        if "rgb" in self.modalities:
            self._output_buffer["rgb"] = th.zeros(
                (self._camera_count(), self._camera_resolution[1], self._camera_resolution[0], 3), device="cuda:0"
            )
        if "depth" in self.modalities:
            self._output_buffer["depth"] = th.zeros(
                (self._camera_count(), self._camera_resolution[1], self._camera_resolution[0], 1), device="cuda:0"
            )

    def _camera_count(self):
        """Number of cameras packed into the tiled image."""
        return len(self._camera_prims)

    def _tiled_grid_shape(self):
        """(cols, rows) of the near-square tile grid that fits all cameras."""
        cols = round(math.sqrt(self._camera_count()))
        rows = math.ceil(self._camera_count() / cols)
        return (cols, rows)

    def _tiled_img_shape(self):
        """(width, height) in pixels of the full tiled image."""
        cols, rows = self._tiled_grid_shape()
        width, height = self._camera_resolution
        return (width * cols, height * rows)

    def get_obs(self):
        """
        Fetch the latest tiled render and de-tile it into per-camera batches.

        Returns:
            dict: Maps each modality name to a (num_cameras, H, W, C) torch tensor
                on cuda:0. The tensors are reused across calls (no reallocation).
        """
        # TODO: somehow isaac 4.1.0 introduced a bug: this always return a warp array
        # on cpu instead of gpu, even when explicitly specifying device="cuda:0"
        tiled_data = self._annotator.get_data().to(device="cuda:0")

        # Local import to avoid a circular import at module load time — TODO confirm
        # whether this can be hoisted to the top of the file.
        from omnigibson.utils.deprecated_utils import reshape_tiled_image

        for modality in self.modalities:
            # The annotator buffer packs rgb first, then depth, so only the depth
            # launch needs an offset — and only when rgb is actually rendered.
            # (The previous code applied the depth offset to *every* modality and
            # crashed with KeyError when "rgb" was not requested.)
            buffer_offset = (
                self._output_buffer["rgb"].numel() if modality == "depth" and "rgb" in self.modalities else 0
            )
            lazy.warp.launch(
                kernel=reshape_tiled_image,
                dim=(self._camera_count(), self._camera_resolution[1], self._camera_resolution[0]),
                inputs=[
                    tiled_data,
                    lazy.warp.from_torch(self._output_buffer[modality]),  # zero-copy alias
                    *list(self._output_buffer[modality].shape[1:]),  # height, width, num_channels
                    self._tiled_grid_shape()[0],  # num_tiles_x
                    buffer_offset,
                ],
                device="cuda:0",
            )
        return self._output_buffer
44 changes: 44 additions & 0 deletions omnigibson/utils/deprecated_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,3 +1029,47 @@ def get_world_pose(fabric_prim):
result_transform.Orthonormalize()
result_transform = np.transpose(result_transform)
return result_transform[:3, 3], R.from_matrix(result_transform[:3, :3]).as_quat()


@wp.kernel
def reshape_tiled_image(
    tiled_image_buffer: wp.array(dtype=float),
    batched_image: wp.array(dtype=float, ndim=4),
    image_height: int,
    image_width: int,
    num_channels: int,
    num_tiles_x: int,
    offset: int,
):
    """Reshapes a tiled image into a batch of images.

    Each Warp thread copies one pixel (all of its channels) from the flat tiled
    buffer into the correct (camera, row, col) slot of the batched output. The
    indexing assumes tiles are laid out row-major in the tiled image: camera ``k``
    occupies grid cell ``(k % num_tiles_x, k // num_tiles_x)``, and the tiled
    image's row stride is ``num_tiles_x * image_width * num_channels``.

    Args:
        tiled_image_buffer: The flat input buffer holding the full tiled image
            (all camera tiles, row-major), starting at ``offset``.
        batched_image: The output image. Shape is (num_cameras, height, width, num_channels).
        image_height: The height of a single (untiled) camera image.
        image_width: The width of a single (untiled) camera image.
        num_channels: The number of channels in the image.
        num_tiles_x: The number of tiles in the x-direction.
        offset: Flat-index offset into ``tiled_image_buffer``. Used when multiple
            image types (e.g. rgb then depth) are concatenated in the buffer.
    """
    # One thread per (camera, row, column) pixel; launch dim must match this order.
    camera_id, height_id, width_id = wp.tid()

    # Grid cell of this camera's tile within the tiled image (row-major layout).
    tile_x_id = camera_id % num_tiles_x
    tile_y_id = camera_id // num_tiles_x
    # Flat index of this pixel's first channel in the tiled buffer:
    # full tiled-image rows above this pixel, plus whole tiles to its left,
    # plus pixels to its left within this tile — each scaled by num_channels.
    pixel_start = (
        offset
        + num_channels * num_tiles_x * image_width * (image_height * tile_y_id + height_id)
        + num_channels * tile_x_id * image_width
        + num_channels * width_id
    )

    # Copy every channel of the pixel into the batched image.
    for i in range(num_channels):
        batched_image[camera_id, height_id, width_id, i] = tiled_image_buffer[pixel_start + i]
Loading