ARKit Scene in Rerun
How to Easily Visualize ARKit’s 3D Indoor Scenes
This tutorial walks through the complete code for visualizing a 3D indoor scene captured with Apple’s ARKit, using the open-source visualisation tool Rerun.
It also highlights the dataset’s potential in developing immersive AR experiences and enhancing machine learning models for real-world applications, while showcasing Rerun’s visualisation capabilities.
If you’re eager to give the example a try: Try it in browser
Complete Code
In this tutorial, you’ll learn:
- How to visualize RGB and depth images
- How to visualize 3D meshes
- How to visualize bounding boxes with labels
ARKitScenes Dataset
The ARKitScenes dataset, captured using Apple’s ARKit technology, encompasses a diverse array of indoor scenes.
Every 3D indoor scene contains:
- Colour and Depth Images
- Reconstructed 3D Meshes
- Labelled Bounding Boxes Around Objects
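Based on the file paths used by the log_arkit function below, a downloaded recording directory looks roughly like this (an illustrative sketch, not an authoritative description of the dataset layout):

<video_id>/
    lowres_wide/                      # low-resolution RGB frames (PNG)
    lowres_depth/                     # low-resolution depth maps (PNG)
    lowres_wide_intrinsics/           # per-frame .pincam intrinsics
    wide/                             # high-resolution RGB frames
    highres_depth/                    # high-resolution depth maps
    wide_intrinsics/                  # high-resolution intrinsics
    lowres_wide.traj                  # camera trajectory, one pose per line
    <video_id>_3dod_mesh.ply          # reconstructed 3D mesh
    <video_id>_3dod_annotation.json   # labelled 3D bounding boxes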
The log_arkit function given below extracts and visualizes the data. If you want to learn more about the scene structure, the data organisation of the scenes is explained here. As this tutorial focuses on the visualisation part, the next section explains Rerun's logging methods.
import json
from pathlib import Path
from typing import Any

import cv2
import numpy as np
import rerun as rr
import trimesh
from scipy.spatial.transform import Rotation as R
from tqdm import tqdm

# Entity path for the low-resolution posed camera (assumed constant; see the
# "Log a moving RGB-D camera" section below).
LOWRES_POSED_ENTITY_PATH = "world/camera_lowres"


def log_arkit(recording_path: Path, include_highres: bool) -> None:
    video_id = recording_path.stem
    lowres_image_dir = recording_path / "lowres_wide"
    image_dir = recording_path / "wide"
    lowres_depth_dir = recording_path / "lowres_depth"
    depth_dir = recording_path / "highres_depth"
    lowres_intrinsics_dir = recording_path / "lowres_wide_intrinsics"
    intrinsics_dir = recording_path / "wide_intrinsics"
    traj_path = recording_path / "lowres_wide.traj"

    depth_filenames = [x.name for x in sorted(lowres_depth_dir.iterdir())]
    lowres_frame_ids = [x.split(".png")[0].split("_")[1] for x in depth_filenames]
    lowres_frame_ids.sort()

    camera_from_world_dict = {}
    with open(traj_path, encoding="utf-8") as f:
        trajectory = f.readlines()

    for line in trajectory:
        timestamp, camera_from_world = read_camera_from_world(line)
        timestamp = f"{round(float(timestamp), 3):.3f}"
        camera_from_world_dict[timestamp] = camera_from_world

    rr.log("world", rr.ViewCoordinates.RIGHT_HAND_Z_UP, timeless=True)

    ply_path = recording_path / f"{recording_path.stem}_3dod_mesh.ply"
    # Extract the 3D mesh data
    mesh = trimesh.load(str(ply_path))
    # Log the 3D mesh
    rr.log(
        "world/mesh",
        rr.Mesh3D(
            vertex_positions=mesh.vertices,
            vertex_colors=mesh.visual.vertex_colors,
            indices=mesh.faces,
        ),
        timeless=True,
    )

    # Extract annotation boxes data
    bbox_annotations_path = recording_path / f"{recording_path.stem}_3dod_annotation.json"
    annotation = load_json(bbox_annotations_path)
    # Log the annotated boxes
    log_annotated_bboxes(annotation)

    print("Processing frames…")
    for frame_timestamp in tqdm(lowres_frame_ids):
        rr.set_time_seconds("time", float(frame_timestamp))
        bgr = cv2.imread(f"{lowres_image_dir}/{video_id}_{frame_timestamp}.png")
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        depth = cv2.imread(f"{lowres_depth_dir}/{video_id}_{frame_timestamp}.png", cv2.IMREAD_ANYDEPTH)
        high_res_exists: bool = (image_dir / f"{video_id}_{frame_timestamp}.png").exists() and include_highres

        if frame_timestamp in camera_from_world_dict:
            lowres_intri_path = lowres_intrinsics_dir / f"{video_id}_{frame_timestamp}.pincam"
            # Log the camera transform and intrinsics
            log_camera(
                lowres_intri_path,
                frame_timestamp,
                camera_from_world_dict,
                LOWRES_POSED_ENTITY_PATH,
            )
            # Log the RGB and depth images
            rr.log(f"{LOWRES_POSED_ENTITY_PATH}/rgb", rr.Image(rgb).compress(jpeg_quality=95))
            rr.log(f"{LOWRES_POSED_ENTITY_PATH}/depth", rr.DepthImage(depth, meter=1000))

def log_annotated_bboxes(annotation: dict[str, Any]) -> None:
    """Logs annotated oriented bounding boxes to Rerun."""
    for label_info in annotation["data"]:
        uid = label_info["uid"]
        label = label_info["label"]
        half_size = 0.5 * np.array(label_info["segments"]["obbAligned"]["axesLengths"]).reshape(-1, 3)[0]
        centroid = np.array(label_info["segments"]["obbAligned"]["centroid"]).reshape(-1, 3)[0]
        rotation = np.array(label_info["segments"]["obbAligned"]["normalizedAxes"]).reshape(3, 3)
        rot = R.from_matrix(rotation).inv()

        rr.log(
            f"world/annotations/box-{uid}-{label}",
            rr.Boxes3D(
                half_sizes=half_size,
                centers=centroid,
                rotations=rr.Quaternion(xyzw=rot.as_quat()),
                labels=label,
            ),
            timeless=True,
        )

def log_camera(
    intri_path: Path,
    frame_id: str,
    poses_from_traj: dict[str, rr.TranslationRotationScale3D],
    entity_id: str,
) -> None:
    """Logs the camera transform and pinhole intrinsics for one frame."""
    w, h, fx, fy, cx, cy = np.loadtxt(intri_path)
    intrinsic = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
    camera_from_world = poses_from_traj[frame_id]

    # Clear any 2D bounding-box segments logged for previous frames
    rr.log(f"{entity_id}/bbox-2D-segments", rr.Clear(recursive=True))

    # Log the camera extrinsics and intrinsics on the same entity
    rr.log(entity_id, rr.Transform3D(transform=camera_from_world))
    rr.log(entity_id, rr.Pinhole(image_from_camera=intrinsic, resolution=[w, h]))

def read_camera_from_world(traj_string: str) -> tuple[str, rr.TranslationRotationScale3D]:
    """Reads out the camera_from_world transform from a trajectory line."""
    tokens = traj_string.split()  # Split the input string into tokens
    assert len(tokens) == 7, f"Input string must have 7 tokens, but found {len(tokens)}."

    ts: str = tokens[0]  # Extract the timestamp from the first token

    # Extract the rotation (axis-angle) from the second to fourth tokens
    angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])]
    rotation = R.from_rotvec(np.asarray(angle_axis))

    # Extract the translation from the fifth to seventh tokens
    translation = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])])

    # Create the transform in the format rr.Transform3D expects
    camera_from_world = rr.TranslationRotationScale3D(
        translation, rr.Quaternion(xyzw=rotation.as_quat()), from_parent=True
    )
    return (ts, camera_from_world)

def find_closest_frame_id(target_id: str, frame_ids: dict[str, Any]) -> str:
    """Finds the closest frame id to the target id."""
    target_value = float(target_id)
    closest_id = min(frame_ids.keys(), key=lambda x: abs(float(x) - target_value))
    return closest_id

def load_json(js_path: Path) -> dict[str, Any]:
    with open(js_path, encoding="utf8") as f:
        json_data: dict[str, Any] = json.load(f)
    return json_data
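If you want to run these functions end to end, a minimal driver could look like the sketch below; the recording path is a placeholder for wherever you have downloaded an ARKitScenes recording, and the application id is arbitrary:

from pathlib import Path

import rerun as rr

# Start a Rerun recording and spawn the viewer.
rr.init("rerun_example_arkit_scenes", spawn=True)

# Placeholder: point this at a downloaded ARKitScenes recording folder.
recording_path = Path("path/to/ARKitScenes/recording")
log_arkit(recording_path, include_highres=False)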
Logging and Visualizing with Rerun
Rerun is fast, versatile, open-source and easy to use.
Rerun is a visualisation tool that consists of an SDK and a viewer for logging, visualizing, and interacting with multimodal data streams. The SDK provides a simple interface for logging timestamped multimodal data, which can then be explored interactively in the Rerun viewer; a minimal sketch follows the list below.
Key advantages of Rerun:
- It’s free and open-source
- Supported by an active community
- Usable from C++, Python, and Rust
- Stands out for its speed and developer-friendly interface
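As a minimal sketch of that log-and-view workflow (the entity path and the random points below are made up purely for illustration):

import numpy as np
import rerun as rr

rr.init("rerun_example_minimal", spawn=True)  # spawn=True opens the Rerun viewer

for step in range(10):
    rr.set_time_seconds("time", step * 0.1)           # timestamp subsequent log calls
    points = np.random.rand(100, 3)                    # placeholder 3D data
    rr.log("points", rr.Points3D(points, radii=0.01))  # log to the "points" entity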
Entities and Components
Rerun uses an Entity Component System architecture pattern in which entities represent generic objects while components describe data associated with those entities.
In our example, we have these entities:
- world entity: includes the 3D mesh data (world/mesh), the pinhole camera (world/camera_lowres), and the annotations (world/annotations)
- video entity: includes the RGB images (video/rgb) and the depth images (video/depth)
You can learn more on the Entities and Components page.
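As a small, self-contained illustration of how entity paths and archetypes fit together (the paths and values here are illustrative and not part of the ARKit example):

import numpy as np
import rerun as rr

rr.init("rerun_example_entities", spawn=True)

# "world" and "world/points" form a parent/child hierarchy; the Points3D
# archetype bundles position, radius, and colour components on the child entity.
rr.log("world", rr.ViewCoordinates.RIGHT_HAND_Z_UP, timeless=True)
rr.log(
    "world/points",
    rr.Points3D(np.random.rand(50, 3), radii=0.02, colors=[255, 0, 0]),
)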
Log a moving RGB-D camera
To log a moving RGB-D camera, we log four key components: the camera’s intrinsics via a pinhole camera model, its pose (extrinsics), and the colour and depth images.
The camera intrinsics, which define the camera’s lens properties, and the pose, detailing its position and orientation, are logged to create a comprehensive 3D to 2D mapping. Both the RGB and depth images are then logged as child entities, capturing the visual and depth aspects of the scene, respectively. This approach ensures a detailed recording of the camera’s viewpoint and the scene it captures.
# Log Pinhole Camera and its transforms
rr.log("world/camera_lowres", rr.Transform3D(transform=camera_from_world))
rr.log("world/camera_lowres", rr.Pinhole(image_from_camera=intrinsic, resolution=[w, h]))
# Log RGB Image
rr.log("video/rgb", rr.Image(rgb).compress(jpeg_quality=95))
# Log Depth Image
rr.log("video/depth", rr.DepthImage(depth, meter=1000))
Here’s a breakdown of the steps:
- The pinhole camera is used to achieve a 3D view and camera perspective through the Pinhole and Transform3D archetypes (a numeric sketch of this projection follows the list).
- The RGB images are logged with the Image archetype.
- The depth images are logged with the DepthImage archetype.
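For intuition about the 3D-to-2D mapping, the intrinsic matrix built from the .pincam file projects a point in the camera frame onto the image plane. A small numeric sketch (all values below are made up):

import numpy as np

# Illustrative intrinsics: focal lengths fx, fy and principal point (cx, cy), in pixels.
fx, fy, cx, cy = 600.0, 600.0, 320.0, 240.0
K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])

# A 3D point expressed in the camera frame (i.e. after applying camera_from_world).
point_cam = np.array([0.2, -0.1, 1.5])

# Pinhole projection: multiply by K, then divide by depth to get pixel coordinates.
u, v, w = K @ point_cam
print(u / w, v / w)  # column and row of the corresponding pixel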
Log 3D Mesh
The mesh is composed of mesh vertices, indices (i.e., which vertices belong to the same face), and vertex colors.
# ... load mesh data from dataset ...
rr.log(
    "world/mesh",
    rr.Mesh3D(
        vertex_positions=mesh.vertices,
        vertex_colors=mesh.visual.vertex_colors,
        indices=mesh.faces,
    ),
    timeless=True,
)
Here, the mesh is logged to the world/mesh entity using the Mesh3D archetype, and it is marked as timeless since it does not change in the context of this visualisation.
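To make the archetype’s fields concrete, here is a self-contained sketch with made-up geometry (a single coloured quad instead of the dataset mesh):

import numpy as np
import rerun as rr

rr.init("rerun_example_mesh3d", spawn=True)

# Four vertices and two triangles forming a quad in the z = 0 plane.
vertex_positions = np.array([[0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0]], dtype=np.float32)
vertex_colors = np.array([[255, 0, 0], [0, 255, 0], [0, 0, 255], [255, 255, 0]], dtype=np.uint8)
indices = np.array([[0, 1, 2], [0, 2, 3]])  # each row lists the vertex indices of one face

rr.log(
    "world/mesh",
    rr.Mesh3D(vertex_positions=vertex_positions, vertex_colors=vertex_colors, indices=indices),
    timeless=True,
)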
3D Bounding Boxes
Here we loop through the data and add bounding boxes to all the items found.
# .. load annotation data from dataset ...
for i, label_info in enumerate(annotation["data"]):
    # ... extract uid, label, half_size, centroid and rot as in log_annotated_bboxes ...
    rr.log(
        f"world/annotations/box-{uid}-{label}",
        rr.Boxes3D(
            half_sizes=half_size,
            centers=centroid,
            rotations=rr.Quaternion(xyzw=rot.as_quat()),
            labels=label,
            colors=colors[i],
        ),
        timeless=True,
    )
The bounding boxes are logged with the Boxes3D archetype.
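For reference, the fields accessed by log_annotated_bboxes imply an annotation file shaped roughly like this (the values are illustrative, not taken from the dataset):

annotation = {
    "data": [
        {
            "uid": "0",
            "label": "chair",
            "segments": {
                "obbAligned": {
                    "centroid": [1.2, 0.4, 0.5],     # box centre
                    "axesLengths": [0.6, 0.6, 1.0],  # full extents (halved in the code above)
                    # 3x3 rotation matrix, flattened row by row
                    "normalizedAxes": [1, 0, 0, 0, 1, 0, 0, 0, 1],
                },
            },
        },
    ],
}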
Beyond ARKit Scenes
If you found this article useful and insightful, there’s more!
Similar articles:
I regularly share tutorials on visualisation for computer vision and robotics. Follow me for future updates!
You can get my articles in your inbox. Subscribe here.
Also, you can find me on LinkedIn.
Sources
[1] ARKitScenes dataset by Apple under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License
[2] Rerun Docs by Rerun under MIT license